diff --git a/clang/include/clang/AST/ExprConcepts.h b/clang/include/clang/AST/ExprConcepts.h index d900e980852b..13d4568119eb 100644 --- a/clang/include/clang/AST/ExprConcepts.h +++ b/clang/include/clang/AST/ExprConcepts.h @@ -1,567 +1,575 @@ //===- ExprConcepts.h - C++2a Concepts expressions --------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // /// \file /// Defines Expressions and AST nodes for C++2a concepts. // //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_AST_EXPRCONCEPTS_H #define LLVM_CLANG_AST_EXPRCONCEPTS_H -#include "clang/AST/ASTContext.h" #include "clang/AST/ASTConcept.h" +#include "clang/AST/ASTContext.h" #include "clang/AST/Decl.h" -#include "clang/AST/DeclarationName.h" #include "clang/AST/DeclTemplate.h" +#include "clang/AST/DeclarationName.h" #include "clang/AST/Expr.h" #include "clang/AST/NestedNameSpecifier.h" #include "clang/AST/TemplateBase.h" #include "clang/AST/Type.h" #include "clang/Basic/SourceLocation.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TrailingObjects.h" -#include #include +#include namespace clang { class ASTStmtReader; class ASTStmtWriter; /// \brief Represents the specialization of a concept - evaluates to a prvalue /// of type bool. /// /// According to C++2a [expr.prim.id]p3 an id-expression that denotes the /// specialization of a concept results in a prvalue of type bool. class ConceptSpecializationExpr final : public Expr, public ConceptReference { friend class ASTReader; friend class ASTStmtReader; public: using SubstitutionDiagnostic = std::pair; protected: /// \brief The Implicit Concept Specialization Decl, which holds the template /// arguments for this specialization. ImplicitConceptSpecializationDecl *SpecDecl; /// \brief Information about the satisfaction of the named concept with the /// given arguments. If this expression is value dependent, this is to be /// ignored. 
ASTConstraintSatisfaction *Satisfaction; ConceptSpecializationExpr(const ASTContext &C, NestedNameSpecifierLoc NNS, SourceLocation TemplateKWLoc, DeclarationNameInfo ConceptNameInfo, NamedDecl *FoundDecl, ConceptDecl *NamedConcept, const ASTTemplateArgumentListInfo *ArgsAsWritten, ImplicitConceptSpecializationDecl *SpecDecl, const ConstraintSatisfaction *Satisfaction); ConceptSpecializationExpr(const ASTContext &C, ConceptDecl *NamedConcept, const ASTTemplateArgumentListInfo *ArgsAsWritten, ImplicitConceptSpecializationDecl *SpecDecl, const ConstraintSatisfaction *Satisfaction, bool Dependent, bool ContainsUnexpandedParameterPack); ConceptSpecializationExpr(EmptyShell Empty); public: static ConceptSpecializationExpr * Create(const ASTContext &C, NestedNameSpecifierLoc NNS, SourceLocation TemplateKWLoc, DeclarationNameInfo ConceptNameInfo, NamedDecl *FoundDecl, ConceptDecl *NamedConcept, const ASTTemplateArgumentListInfo *ArgsAsWritten, ImplicitConceptSpecializationDecl *SpecDecl, const ConstraintSatisfaction *Satisfaction); static ConceptSpecializationExpr * Create(const ASTContext &C, ConceptDecl *NamedConcept, const ASTTemplateArgumentListInfo *ArgsAsWritten, ImplicitConceptSpecializationDecl *SpecDecl, const ConstraintSatisfaction *Satisfaction, bool Dependent, bool ContainsUnexpandedParameterPack); ArrayRef getTemplateArguments() const { return SpecDecl->getTemplateArguments(); } const ImplicitConceptSpecializationDecl *getSpecializationDecl() const { assert(SpecDecl && "Template Argument Decl not initialized"); return SpecDecl; } /// \brief Whether or not the concept with the given arguments was satisfied /// when the expression was created. /// The expression must not be dependent. bool isSatisfied() const { assert(!isValueDependent() && "isSatisfied called on a dependent ConceptSpecializationExpr"); return Satisfaction->IsSatisfied; } /// \brief Get elaborated satisfaction info about the template arguments' /// satisfaction of the named concept. /// The expression must not be dependent. const ASTConstraintSatisfaction &getSatisfaction() const { assert(!isValueDependent() && "getSatisfaction called on dependent ConceptSpecializationExpr"); return *Satisfaction; } static bool classof(const Stmt *T) { return T->getStmtClass() == ConceptSpecializationExprClass; } SourceLocation getBeginLoc() const LLVM_READONLY { if (auto QualifierLoc = getNestedNameSpecifierLoc()) return QualifierLoc.getBeginLoc(); return ConceptName.getBeginLoc(); } SourceLocation getEndLoc() const LLVM_READONLY { // If the ConceptSpecializationExpr is the ImmediatelyDeclaredConstraint // of a TypeConstraint written syntactically as a constrained-parameter, // there may not be a template argument list. return ArgsAsWritten->RAngleLoc.isValid() ? ArgsAsWritten->RAngleLoc : ConceptName.getEndLoc(); } // Iterators child_range children() { return child_range(child_iterator(), child_iterator()); } const_child_range children() const { return const_child_range(const_child_iterator(), const_child_iterator()); } }; namespace concepts { /// \brief A static requirement that can be used in a requires-expression to /// check properties of types and expression. class Requirement { public: // Note - simple and compound requirements are both represented by the same // class (ExprRequirement). enum RequirementKind { RK_Type, RK_Simple, RK_Compound, RK_Nested }; private: const RequirementKind Kind; // FIXME: use RequirementDependence to model dependence? 
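// Editorial example (not part of this patch): the four RequirementKind values
// above map onto the four requirement forms a requires-expression can contain,
// while naming a concept with arguments (e.g. Example<int>) is what the
// ConceptSpecializationExpr above models - a prvalue of type bool. A minimal,
// self-contained C++20 sketch:
#include <concepts>

template <typename T>
concept Example = requires(T t) {
  typename T::value_type;                                // RK_Type: type requirement
  t.size();                                              // RK_Simple: simple requirement
  { *t.begin() } noexcept -> std::convertible_to<int>;   // RK_Compound: compound requirement
  requires std::movable<T>;                              // RK_Nested: nested requirement
};

static_assert(!Example<int>);  // the concept-id Example<int> is a ConceptSpecializationExpr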
bool Dependent : 1; bool ContainsUnexpandedParameterPack : 1; bool Satisfied : 1; public: struct SubstitutionDiagnostic { StringRef SubstitutedEntity; // FIXME: Store diagnostics semantically and not as prerendered strings. // Fixing this probably requires serialization of PartialDiagnostic // objects. SourceLocation DiagLoc; StringRef DiagMessage; }; Requirement(RequirementKind Kind, bool IsDependent, bool ContainsUnexpandedParameterPack, bool IsSatisfied = true) : Kind(Kind), Dependent(IsDependent), ContainsUnexpandedParameterPack(ContainsUnexpandedParameterPack), Satisfied(IsSatisfied) {} RequirementKind getKind() const { return Kind; } bool isSatisfied() const { assert(!Dependent && "isSatisfied can only be called on non-dependent requirements."); return Satisfied; } void setSatisfied(bool IsSatisfied) { assert(!Dependent && "setSatisfied can only be called on non-dependent requirements."); Satisfied = IsSatisfied; } void setDependent(bool IsDependent) { Dependent = IsDependent; } bool isDependent() const { return Dependent; } void setContainsUnexpandedParameterPack(bool Contains) { ContainsUnexpandedParameterPack = Contains; } bool containsUnexpandedParameterPack() const { return ContainsUnexpandedParameterPack; } }; /// \brief A requires-expression requirement which queries the existence of a /// type name or type template specialization ('type' requirements). class TypeRequirement : public Requirement { public: enum SatisfactionStatus { SS_Dependent, SS_SubstitutionFailure, SS_Satisfied }; private: llvm::PointerUnion Value; SatisfactionStatus Status; public: friend ASTStmtReader; friend ASTStmtWriter; /// \brief Construct a type requirement from a type. If the given type is not /// dependent, this indicates that the type exists and the requirement will be /// satisfied. Otherwise, the SubstitutionDiagnostic constructor is to be /// used. TypeRequirement(TypeSourceInfo *T); /// \brief Construct a type requirement when the nested name specifier is /// invalid due to a bad substitution. The requirement is unsatisfied. TypeRequirement(SubstitutionDiagnostic *Diagnostic) : Requirement(RK_Type, false, false, false), Value(Diagnostic), Status(SS_SubstitutionFailure) {} SatisfactionStatus getSatisfactionStatus() const { return Status; } void setSatisfactionStatus(SatisfactionStatus Status) { this->Status = Status; } bool isSubstitutionFailure() const { return Status == SS_SubstitutionFailure; } SubstitutionDiagnostic *getSubstitutionDiagnostic() const { assert(Status == SS_SubstitutionFailure && "Attempted to get substitution diagnostic when there has been no " "substitution failure."); return Value.get(); } TypeSourceInfo *getType() const { assert(!isSubstitutionFailure() && "Attempted to get type when there has been a substitution failure."); return Value.get(); } static bool classof(const Requirement *R) { return R->getKind() == RK_Type; } }; /// \brief A requires-expression requirement which queries the validity and /// properties of an expression ('simple' and 'compound' requirements). class ExprRequirement : public Requirement { public: enum SatisfactionStatus { SS_Dependent, SS_ExprSubstitutionFailure, SS_NoexceptNotMet, SS_TypeRequirementSubstitutionFailure, SS_ConstraintsNotSatisfied, SS_Satisfied }; class ReturnTypeRequirement { llvm::PointerIntPair< llvm::PointerUnion, 1, bool> TypeConstraintInfo; public: friend ASTStmtReader; friend ASTStmtWriter; /// \brief No return type requirement was specified. 
ReturnTypeRequirement() : TypeConstraintInfo(nullptr, false) {} /// \brief A return type requirement was specified but it was a /// substitution failure. ReturnTypeRequirement(SubstitutionDiagnostic *SubstDiag) : TypeConstraintInfo(SubstDiag, false) {} /// \brief A 'type constraint' style return type requirement. /// \param TPL an invented template parameter list containing a single /// type parameter with a type-constraint. // TODO: Can we maybe not save the whole template parameter list and just // the type constraint? Saving the whole TPL makes it easier to handle in // serialization but is less elegant. ReturnTypeRequirement(TemplateParameterList *TPL); bool isDependent() const { return TypeConstraintInfo.getInt(); } bool containsUnexpandedParameterPack() const { if (!isTypeConstraint()) return false; return getTypeConstraintTemplateParameterList() ->containsUnexpandedParameterPack(); } bool isEmpty() const { return TypeConstraintInfo.getPointer().isNull(); } bool isSubstitutionFailure() const { return !isEmpty() && TypeConstraintInfo.getPointer().is(); } bool isTypeConstraint() const { return !isEmpty() && TypeConstraintInfo.getPointer().is(); } SubstitutionDiagnostic *getSubstitutionDiagnostic() const { assert(isSubstitutionFailure()); return TypeConstraintInfo.getPointer().get(); } const TypeConstraint *getTypeConstraint() const; TemplateParameterList *getTypeConstraintTemplateParameterList() const { assert(isTypeConstraint()); return TypeConstraintInfo.getPointer().get(); } }; private: llvm::PointerUnion Value; SourceLocation NoexceptLoc; // May be empty if noexcept wasn't specified. ReturnTypeRequirement TypeReq; ConceptSpecializationExpr *SubstitutedConstraintExpr; SatisfactionStatus Status; public: friend ASTStmtReader; friend ASTStmtWriter; /// \brief Construct a compound requirement. /// \param E the expression which is checked by this requirement. /// \param IsSimple whether this was a simple requirement in source. /// \param NoexceptLoc the location of the noexcept keyword, if it was /// specified, otherwise an empty location. /// \param Req the requirement for the type of the checked expression. /// \param Status the satisfaction status of this requirement. ExprRequirement( Expr *E, bool IsSimple, SourceLocation NoexceptLoc, ReturnTypeRequirement Req, SatisfactionStatus Status, ConceptSpecializationExpr *SubstitutedConstraintExpr = nullptr); /// \brief Construct a compound requirement whose expression was a /// substitution failure. The requirement is not satisfied. /// \param E the diagnostic emitted while instantiating the original /// expression. /// \param IsSimple whether this was a simple requirement in source. /// \param NoexceptLoc the location of the noexcept keyword, if it was /// specified, otherwise an empty location. /// \param Req the requirement for the type of the checked expression (omit /// if no requirement was specified). 
ExprRequirement(SubstitutionDiagnostic *E, bool IsSimple, SourceLocation NoexceptLoc, ReturnTypeRequirement Req = {}); bool isSimple() const { return getKind() == RK_Simple; } bool isCompound() const { return getKind() == RK_Compound; } bool hasNoexceptRequirement() const { return NoexceptLoc.isValid(); } SourceLocation getNoexceptLoc() const { return NoexceptLoc; } SatisfactionStatus getSatisfactionStatus() const { return Status; } bool isExprSubstitutionFailure() const { return Status == SS_ExprSubstitutionFailure; } const ReturnTypeRequirement &getReturnTypeRequirement() const { return TypeReq; } ConceptSpecializationExpr * getReturnTypeRequirementSubstitutedConstraintExpr() const { assert(Status >= SS_TypeRequirementSubstitutionFailure); return SubstitutedConstraintExpr; } SubstitutionDiagnostic *getExprSubstitutionDiagnostic() const { assert(isExprSubstitutionFailure() && "Attempted to get expression substitution diagnostic when there has " "been no expression substitution failure"); return Value.get(); } Expr *getExpr() const { assert(!isExprSubstitutionFailure() && "ExprRequirement has no expression because there has been a " "substitution failure."); return Value.get(); } static bool classof(const Requirement *R) { return R->getKind() == RK_Compound || R->getKind() == RK_Simple; } }; /// \brief A requires-expression requirement which is satisfied when a general /// constraint expression is satisfied ('nested' requirements). class NestedRequirement : public Requirement { Expr *Constraint = nullptr; const ASTConstraintSatisfaction *Satisfaction = nullptr; bool HasInvalidConstraint = false; StringRef InvalidConstraintEntity; public: friend ASTStmtReader; friend ASTStmtWriter; NestedRequirement(Expr *Constraint) : Requirement(RK_Nested, /*IsDependent=*/true, Constraint->containsUnexpandedParameterPack()), Constraint(Constraint) { assert(Constraint->isInstantiationDependent() && "Nested requirement with non-dependent constraint must be " "constructed with a ConstraintSatisfaction object"); } NestedRequirement(ASTContext &C, Expr *Constraint, const ConstraintSatisfaction &Satisfaction) : Requirement(RK_Nested, Constraint->isInstantiationDependent(), Constraint->containsUnexpandedParameterPack(), Satisfaction.IsSatisfied), Constraint(Constraint), Satisfaction(ASTConstraintSatisfaction::Create(C, Satisfaction)) {} NestedRequirement(StringRef InvalidConstraintEntity, const ASTConstraintSatisfaction *Satisfaction) : Requirement(RK_Nested, /*IsDependent=*/false, /*ContainsUnexpandedParameterPack*/ false, Satisfaction->IsSatisfied), Satisfaction(Satisfaction), HasInvalidConstraint(true), InvalidConstraintEntity(InvalidConstraintEntity) {} NestedRequirement(ASTContext &C, StringRef InvalidConstraintEntity, const ConstraintSatisfaction &Satisfaction) : NestedRequirement(InvalidConstraintEntity, ASTConstraintSatisfaction::Create(C, Satisfaction)) {} bool hasInvalidConstraint() const { return HasInvalidConstraint; } StringRef getInvalidConstraintEntity() { assert(hasInvalidConstraint()); return InvalidConstraintEntity; } Expr *getConstraintExpr() const { assert(!hasInvalidConstraint() && "getConstraintExpr() may not be called " "on nested requirements with invalid constraint."); return Constraint; } const ASTConstraintSatisfaction &getConstraintSatisfaction() const { return *Satisfaction; } static bool classof(const Requirement *R) { return R->getKind() == RK_Nested; } }; +using EntityPrinter = llvm::function_ref; + +/// \brief create a Requirement::SubstitutionDiagnostic with only a +/// 
SubstitutedEntity and DiagLoc using Sema's allocator. +Requirement::SubstitutionDiagnostic * +createSubstDiagAt(Sema &S, SourceLocation Location, EntityPrinter Printer); + } // namespace concepts /// C++2a [expr.prim.req]: /// A requires-expression provides a concise way to express requirements on /// template arguments. A requirement is one that can be checked by name /// lookup (6.4) or by checking properties of types and expressions. /// [...] /// A requires-expression is a prvalue of type bool [...] class RequiresExpr final : public Expr, llvm::TrailingObjects { friend TrailingObjects; friend class ASTStmtReader; unsigned NumLocalParameters; unsigned NumRequirements; RequiresExprBodyDecl *Body; SourceLocation RBraceLoc; unsigned numTrailingObjects(OverloadToken) const { return NumLocalParameters; } unsigned numTrailingObjects(OverloadToken) const { return NumRequirements; } RequiresExpr(ASTContext &C, SourceLocation RequiresKWLoc, RequiresExprBodyDecl *Body, ArrayRef LocalParameters, ArrayRef Requirements, SourceLocation RBraceLoc); RequiresExpr(ASTContext &C, EmptyShell Empty, unsigned NumLocalParameters, unsigned NumRequirements); public: static RequiresExpr * Create(ASTContext &C, SourceLocation RequiresKWLoc, RequiresExprBodyDecl *Body, ArrayRef LocalParameters, ArrayRef Requirements, SourceLocation RBraceLoc); static RequiresExpr * Create(ASTContext &C, EmptyShell Empty, unsigned NumLocalParameters, unsigned NumRequirements); ArrayRef getLocalParameters() const { return {getTrailingObjects(), NumLocalParameters}; } RequiresExprBodyDecl *getBody() const { return Body; } ArrayRef getRequirements() const { return {getTrailingObjects(), NumRequirements}; } /// \brief Whether or not the requires clause is satisfied. /// The expression must not be dependent. bool isSatisfied() const { assert(!isValueDependent() && "isSatisfied called on a dependent RequiresExpr"); return RequiresExprBits.IsSatisfied; } void setSatisfied(bool IsSatisfied) { assert(!isValueDependent() && "setSatisfied called on a dependent RequiresExpr"); RequiresExprBits.IsSatisfied = IsSatisfied; } SourceLocation getRequiresKWLoc() const { return RequiresExprBits.RequiresKWLoc; } SourceLocation getRBraceLoc() const { return RBraceLoc; } static bool classof(const Stmt *T) { return T->getStmtClass() == RequiresExprClass; } SourceLocation getBeginLoc() const LLVM_READONLY { return RequiresExprBits.RequiresKWLoc; } SourceLocation getEndLoc() const LLVM_READONLY { return RBraceLoc; } // Iterators child_range children() { return child_range(child_iterator(), child_iterator()); } const_child_range children() const { return const_child_range(const_child_iterator(), const_child_iterator()); } }; } // namespace clang #endif // LLVM_CLANG_AST_EXPRCONCEPTS_H diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index a3506df7d4e5..f09d1129b128 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1,7476 +1,7476 @@ //===--- CodeGenModule.cpp - Emit LLVM Code from ASTs for a Module --------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This coordinates the per-module state used while generating code. 
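// Editorial usage sketch (not part of this patch) for the createSubstDiagAt()
// helper declared above. It assumes EntityPrinter is a callback taking an
// llvm::raw_ostream & (consistent with the new STLFunctionalExtras.h include);
// SemaRef, FailedExpr, Policy and NoexceptLoc are hypothetical names standing
// in for whatever the real caller in Sema has at hand.
concepts::Requirement::SubstitutionDiagnostic *SubstDiag =
    concepts::createSubstDiagAt(SemaRef, FailedExpr->getExprLoc(),
                                [&](llvm::raw_ostream &OS) {
                                  FailedExpr->printPretty(OS, /*Helper=*/nullptr,
                                                          Policy);
                                });
// The diagnostic can then back an unsatisfied requirement, e.g.:
auto *Req = new (SemaRef.Context) concepts::ExprRequirement(
    SubstDiag, /*IsSimple=*/false, NoexceptLoc);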
// //===----------------------------------------------------------------------===// #include "CodeGenModule.h" #include "ABIInfo.h" #include "CGBlocks.h" #include "CGCUDARuntime.h" #include "CGCXXABI.h" #include "CGCall.h" #include "CGDebugInfo.h" #include "CGHLSLRuntime.h" #include "CGObjCRuntime.h" #include "CGOpenCLRuntime.h" #include "CGOpenMPRuntime.h" #include "CGOpenMPRuntimeGPU.h" #include "CodeGenFunction.h" #include "CodeGenPGO.h" #include "ConstantEmitter.h" #include "CoverageMappingGen.h" #include "TargetInfo.h" #include "clang/AST/ASTContext.h" #include "clang/AST/CharUnits.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/DeclObjC.h" #include "clang/AST/DeclTemplate.h" #include "clang/AST/Mangle.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/StmtVisitor.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/CodeGenOptions.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/FileManager.h" #include "clang/Basic/Module.h" #include "clang/Basic/SourceManager.h" #include "clang/Basic/TargetInfo.h" #include "clang/Basic/Version.h" #include "clang/CodeGen/BackendUtil.h" #include "clang/CodeGen/ConstantInitBuilder.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/IR/AttributeMask.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/ProfileData/SampleProf.h" #include "llvm/Support/CRC.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TimeProfiler.h" #include "llvm/Support/xxhash.h" #include "llvm/TargetParser/Triple.h" #include "llvm/TargetParser/X86TargetParser.h" #include using namespace clang; using namespace CodeGen; static llvm::cl::opt LimitedCoverage( "limited-coverage-experimental", llvm::cl::Hidden, llvm::cl::desc("Emit limited coverage mapping information (experimental)")); static const char AnnotationSection[] = "llvm.metadata"; static CGCXXABI *createCXXABI(CodeGenModule &CGM) { switch (CGM.getContext().getCXXABIKind()) { case TargetCXXABI::AppleARM64: case TargetCXXABI::Fuchsia: case TargetCXXABI::GenericAArch64: case TargetCXXABI::GenericARM: case TargetCXXABI::iOS: case TargetCXXABI::WatchOS: case TargetCXXABI::GenericMIPS: case TargetCXXABI::GenericItanium: case TargetCXXABI::WebAssembly: case TargetCXXABI::XL: return CreateItaniumCXXABI(CGM); case TargetCXXABI::Microsoft: return CreateMicrosoftCXXABI(CGM); } llvm_unreachable("invalid C++ ABI kind"); } static std::unique_ptr createTargetCodeGenInfo(CodeGenModule &CGM) { const TargetInfo &Target = CGM.getTarget(); const llvm::Triple &Triple = Target.getTriple(); const CodeGenOptions &CodeGenOpts = CGM.getCodeGenOpts(); switch (Triple.getArch()) { default: return createDefaultTargetCodeGenInfo(CGM); case llvm::Triple::le32: return createPNaClTargetCodeGenInfo(CGM); case llvm::Triple::m68k: return createM68kTargetCodeGenInfo(CGM); case llvm::Triple::mips: case llvm::Triple::mipsel: if (Triple.getOS() == llvm::Triple::NaCl) return createPNaClTargetCodeGenInfo(CGM); return createMIPSTargetCodeGenInfo(CGM, /*IsOS32=*/true); 
case llvm::Triple::mips64: case llvm::Triple::mips64el: return createMIPSTargetCodeGenInfo(CGM, /*IsOS32=*/false); case llvm::Triple::avr: { // For passing parameters, R8~R25 are used on avr, and R18~R25 are used // on avrtiny. For passing return value, R18~R25 are used on avr, and // R22~R25 are used on avrtiny. unsigned NPR = Target.getABI() == "avrtiny" ? 6 : 18; unsigned NRR = Target.getABI() == "avrtiny" ? 4 : 8; return createAVRTargetCodeGenInfo(CGM, NPR, NRR); } case llvm::Triple::aarch64: case llvm::Triple::aarch64_32: case llvm::Triple::aarch64_be: { AArch64ABIKind Kind = AArch64ABIKind::AAPCS; if (Target.getABI() == "darwinpcs") Kind = AArch64ABIKind::DarwinPCS; else if (Triple.isOSWindows()) return createWindowsAArch64TargetCodeGenInfo(CGM, AArch64ABIKind::Win64); return createAArch64TargetCodeGenInfo(CGM, Kind); } case llvm::Triple::wasm32: case llvm::Triple::wasm64: { WebAssemblyABIKind Kind = WebAssemblyABIKind::MVP; if (Target.getABI() == "experimental-mv") Kind = WebAssemblyABIKind::ExperimentalMV; return createWebAssemblyTargetCodeGenInfo(CGM, Kind); } case llvm::Triple::arm: case llvm::Triple::armeb: case llvm::Triple::thumb: case llvm::Triple::thumbeb: { if (Triple.getOS() == llvm::Triple::Win32) return createWindowsARMTargetCodeGenInfo(CGM, ARMABIKind::AAPCS_VFP); ARMABIKind Kind = ARMABIKind::AAPCS; StringRef ABIStr = Target.getABI(); if (ABIStr == "apcs-gnu") Kind = ARMABIKind::APCS; else if (ABIStr == "aapcs16") Kind = ARMABIKind::AAPCS16_VFP; else if (CodeGenOpts.FloatABI == "hard" || (CodeGenOpts.FloatABI != "soft" && (Triple.getEnvironment() == llvm::Triple::GNUEABIHF || Triple.getEnvironment() == llvm::Triple::MuslEABIHF || Triple.getEnvironment() == llvm::Triple::EABIHF))) Kind = ARMABIKind::AAPCS_VFP; return createARMTargetCodeGenInfo(CGM, Kind); } case llvm::Triple::ppc: { if (Triple.isOSAIX()) return createAIXTargetCodeGenInfo(CGM, /*Is64Bit=*/false); bool IsSoftFloat = CodeGenOpts.FloatABI == "soft" || Target.hasFeature("spe"); return createPPC32TargetCodeGenInfo(CGM, IsSoftFloat); } case llvm::Triple::ppcle: { bool IsSoftFloat = CodeGenOpts.FloatABI == "soft"; return createPPC32TargetCodeGenInfo(CGM, IsSoftFloat); } case llvm::Triple::ppc64: if (Triple.isOSAIX()) return createAIXTargetCodeGenInfo(CGM, /*Is64Bit=*/true); if (Triple.isOSBinFormatELF()) { PPC64_SVR4_ABIKind Kind = PPC64_SVR4_ABIKind::ELFv1; if (Target.getABI() == "elfv2") Kind = PPC64_SVR4_ABIKind::ELFv2; bool IsSoftFloat = CodeGenOpts.FloatABI == "soft"; return createPPC64_SVR4_TargetCodeGenInfo(CGM, Kind, IsSoftFloat); } return createPPC64TargetCodeGenInfo(CGM); case llvm::Triple::ppc64le: { assert(Triple.isOSBinFormatELF() && "PPC64 LE non-ELF not supported!"); PPC64_SVR4_ABIKind Kind = PPC64_SVR4_ABIKind::ELFv2; if (Target.getABI() == "elfv1") Kind = PPC64_SVR4_ABIKind::ELFv1; bool IsSoftFloat = CodeGenOpts.FloatABI == "soft"; return createPPC64_SVR4_TargetCodeGenInfo(CGM, Kind, IsSoftFloat); } case llvm::Triple::nvptx: case llvm::Triple::nvptx64: return createNVPTXTargetCodeGenInfo(CGM); case llvm::Triple::msp430: return createMSP430TargetCodeGenInfo(CGM); case llvm::Triple::riscv32: case llvm::Triple::riscv64: { StringRef ABIStr = Target.getABI(); unsigned XLen = Target.getPointerWidth(LangAS::Default); unsigned ABIFLen = 0; if (ABIStr.endswith("f")) ABIFLen = 32; else if (ABIStr.endswith("d")) ABIFLen = 64; return createRISCVTargetCodeGenInfo(CGM, XLen, ABIFLen); } case llvm::Triple::systemz: { bool SoftFloat = CodeGenOpts.FloatABI == "soft"; bool HasVector = !SoftFloat && 
Target.getABI() == "vector"; return createSystemZTargetCodeGenInfo(CGM, HasVector, SoftFloat); } case llvm::Triple::tce: case llvm::Triple::tcele: return createTCETargetCodeGenInfo(CGM); case llvm::Triple::x86: { bool IsDarwinVectorABI = Triple.isOSDarwin(); bool IsWin32FloatStructABI = Triple.isOSWindows() && !Triple.isOSCygMing(); if (Triple.getOS() == llvm::Triple::Win32) { return createWinX86_32TargetCodeGenInfo( CGM, IsDarwinVectorABI, IsWin32FloatStructABI, CodeGenOpts.NumRegisterParameters); } return createX86_32TargetCodeGenInfo( CGM, IsDarwinVectorABI, IsWin32FloatStructABI, CodeGenOpts.NumRegisterParameters, CodeGenOpts.FloatABI == "soft"); } case llvm::Triple::x86_64: { StringRef ABI = Target.getABI(); X86AVXABILevel AVXLevel = (ABI == "avx512" ? X86AVXABILevel::AVX512 : ABI == "avx" ? X86AVXABILevel::AVX : X86AVXABILevel::None); switch (Triple.getOS()) { case llvm::Triple::Win32: return createWinX86_64TargetCodeGenInfo(CGM, AVXLevel); default: return createX86_64TargetCodeGenInfo(CGM, AVXLevel); } } case llvm::Triple::hexagon: return createHexagonTargetCodeGenInfo(CGM); case llvm::Triple::lanai: return createLanaiTargetCodeGenInfo(CGM); case llvm::Triple::r600: return createAMDGPUTargetCodeGenInfo(CGM); case llvm::Triple::amdgcn: return createAMDGPUTargetCodeGenInfo(CGM); case llvm::Triple::sparc: return createSparcV8TargetCodeGenInfo(CGM); case llvm::Triple::sparcv9: return createSparcV9TargetCodeGenInfo(CGM); case llvm::Triple::xcore: return createXCoreTargetCodeGenInfo(CGM); case llvm::Triple::arc: return createARCTargetCodeGenInfo(CGM); case llvm::Triple::spir: case llvm::Triple::spir64: return createCommonSPIRTargetCodeGenInfo(CGM); case llvm::Triple::spirv32: case llvm::Triple::spirv64: return createSPIRVTargetCodeGenInfo(CGM); case llvm::Triple::ve: return createVETargetCodeGenInfo(CGM); case llvm::Triple::csky: { bool IsSoftFloat = !Target.hasFeature("hard-float-abi"); bool hasFP64 = Target.hasFeature("fpuv2_df") || Target.hasFeature("fpuv3_df"); return createCSKYTargetCodeGenInfo(CGM, IsSoftFloat ? 0 : hasFP64 ? 64 : 32); } case llvm::Triple::bpfeb: case llvm::Triple::bpfel: return createBPFTargetCodeGenInfo(CGM); case llvm::Triple::loongarch32: case llvm::Triple::loongarch64: { StringRef ABIStr = Target.getABI(); unsigned ABIFRLen = 0; if (ABIStr.endswith("f")) ABIFRLen = 32; else if (ABIStr.endswith("d")) ABIFRLen = 64; return createLoongArchTargetCodeGenInfo( CGM, Target.getPointerWidth(LangAS::Default), ABIFRLen); } } } const TargetCodeGenInfo &CodeGenModule::getTargetCodeGenInfo() { if (!TheTargetCodeGenInfo) TheTargetCodeGenInfo = createTargetCodeGenInfo(*this); return *TheTargetCodeGenInfo; } CodeGenModule::CodeGenModule(ASTContext &C, IntrusiveRefCntPtr FS, const HeaderSearchOptions &HSO, const PreprocessorOptions &PPO, const CodeGenOptions &CGO, llvm::Module &M, DiagnosticsEngine &diags, CoverageSourceInfo *CoverageInfo) : Context(C), LangOpts(C.getLangOpts()), FS(FS), HeaderSearchOpts(HSO), PreprocessorOpts(PPO), CodeGenOpts(CGO), TheModule(M), Diags(diags), Target(C.getTargetInfo()), ABI(createCXXABI(*this)), VMContext(M.getContext()), Types(*this), VTables(*this), SanitizerMD(new SanitizerMetadata(*this)) { // Initialize the type cache. 
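// Editorial sketch (not part of this patch): the ABI-string decoding used for
// the RISC-V and LoongArch cases above, pulled out as a standalone helper for
// clarity - "ilp32f"/"lp64f" select a 32-bit FLen, "ilp32d"/"lp64d" a 64-bit
// FLen, and the plain integer ABIs leave it at 0.
static unsigned decodeFPABILen(llvm::StringRef ABIStr) {
  if (ABIStr.endswith("f"))
    return 32;
  if (ABIStr.endswith("d"))
    return 64;
  return 0;
}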
llvm::LLVMContext &LLVMContext = M.getContext(); VoidTy = llvm::Type::getVoidTy(LLVMContext); Int8Ty = llvm::Type::getInt8Ty(LLVMContext); Int16Ty = llvm::Type::getInt16Ty(LLVMContext); Int32Ty = llvm::Type::getInt32Ty(LLVMContext); Int64Ty = llvm::Type::getInt64Ty(LLVMContext); HalfTy = llvm::Type::getHalfTy(LLVMContext); BFloatTy = llvm::Type::getBFloatTy(LLVMContext); FloatTy = llvm::Type::getFloatTy(LLVMContext); DoubleTy = llvm::Type::getDoubleTy(LLVMContext); PointerWidthInBits = C.getTargetInfo().getPointerWidth(LangAS::Default); PointerAlignInBytes = C.toCharUnitsFromBits(C.getTargetInfo().getPointerAlign(LangAS::Default)) .getQuantity(); SizeSizeInBytes = C.toCharUnitsFromBits(C.getTargetInfo().getMaxPointerWidth()).getQuantity(); IntAlignInBytes = C.toCharUnitsFromBits(C.getTargetInfo().getIntAlign()).getQuantity(); CharTy = llvm::IntegerType::get(LLVMContext, C.getTargetInfo().getCharWidth()); IntTy = llvm::IntegerType::get(LLVMContext, C.getTargetInfo().getIntWidth()); IntPtrTy = llvm::IntegerType::get(LLVMContext, C.getTargetInfo().getMaxPointerWidth()); Int8PtrTy = Int8Ty->getPointerTo(0); Int8PtrPtrTy = Int8PtrTy->getPointerTo(0); const llvm::DataLayout &DL = M.getDataLayout(); AllocaInt8PtrTy = Int8Ty->getPointerTo(DL.getAllocaAddrSpace()); GlobalsInt8PtrTy = Int8Ty->getPointerTo(DL.getDefaultGlobalsAddressSpace()); ConstGlobalsPtrTy = Int8Ty->getPointerTo( C.getTargetAddressSpace(GetGlobalConstantAddressSpace())); ASTAllocaAddressSpace = getTargetCodeGenInfo().getASTAllocaAddressSpace(); // Build C++20 Module initializers. // TODO: Add Microsoft here once we know the mangling required for the // initializers. CXX20ModuleInits = LangOpts.CPlusPlusModules && getCXXABI().getMangleContext().getKind() == ItaniumMangleContext::MK_Itanium; RuntimeCC = getTargetCodeGenInfo().getABIInfo().getRuntimeCC(); if (LangOpts.ObjC) createObjCRuntime(); if (LangOpts.OpenCL) createOpenCLRuntime(); if (LangOpts.OpenMP) createOpenMPRuntime(); if (LangOpts.CUDA) createCUDARuntime(); if (LangOpts.HLSL) createHLSLRuntime(); // Enable TBAA unless it's suppressed. ThreadSanitizer needs TBAA even at O0. if (LangOpts.Sanitize.has(SanitizerKind::Thread) || (!CodeGenOpts.RelaxedAliasing && CodeGenOpts.OptimizationLevel > 0)) TBAA.reset(new CodeGenTBAA(Context, TheModule, CodeGenOpts, getLangOpts(), getCXXABI().getMangleContext())); // If debug info or coverage generation is enabled, create the CGDebugInfo // object. if (CodeGenOpts.getDebugInfo() != llvm::codegenoptions::NoDebugInfo || CodeGenOpts.CoverageNotesFile.size() || CodeGenOpts.CoverageDataFile.size()) DebugInfo.reset(new CGDebugInfo(*this)); Block.GlobalUniqueCount = 0; if (C.getLangOpts().ObjC) ObjCData.reset(new ObjCEntrypoints()); if (CodeGenOpts.hasProfileClangUse()) { auto ReaderOrErr = llvm::IndexedInstrProfReader::create( CodeGenOpts.ProfileInstrumentUsePath, *FS, CodeGenOpts.ProfileRemappingFile); // We're checking for profile read errors in CompilerInvocation, so if // there was an error it should've already been caught. If it hasn't been // somehow, trip an assertion. assert(ReaderOrErr); PGOReader = std::move(ReaderOrErr.get()); } // If coverage mapping generation is enabled, create the // CoverageMappingModuleGen object. if (CodeGenOpts.CoverageMapping) CoverageMapping.reset(new CoverageMappingModuleGen(*this, *CoverageInfo)); // Generate the module name hash here if needed. 
if (CodeGenOpts.UniqueInternalLinkageNames && !getModule().getSourceFileName().empty()) { std::string Path = getModule().getSourceFileName(); // Check if a path substitution is needed from the MacroPrefixMap. for (const auto &Entry : LangOpts.MacroPrefixMap) if (Path.rfind(Entry.first, 0) != std::string::npos) { Path = Entry.second + Path.substr(Entry.first.size()); break; } ModuleNameHash = llvm::getUniqueInternalLinkagePostfix(Path); } } CodeGenModule::~CodeGenModule() {} void CodeGenModule::createObjCRuntime() { // This is just isGNUFamily(), but we want to force implementors of // new ABIs to decide how best to do this. switch (LangOpts.ObjCRuntime.getKind()) { case ObjCRuntime::GNUstep: case ObjCRuntime::GCC: case ObjCRuntime::ObjFW: ObjCRuntime.reset(CreateGNUObjCRuntime(*this)); return; case ObjCRuntime::FragileMacOSX: case ObjCRuntime::MacOSX: case ObjCRuntime::iOS: case ObjCRuntime::WatchOS: ObjCRuntime.reset(CreateMacObjCRuntime(*this)); return; } llvm_unreachable("bad runtime kind"); } void CodeGenModule::createOpenCLRuntime() { OpenCLRuntime.reset(new CGOpenCLRuntime(*this)); } void CodeGenModule::createOpenMPRuntime() { // Select a specialized code generation class based on the target, if any. // If it does not exist use the default implementation. switch (getTriple().getArch()) { case llvm::Triple::nvptx: case llvm::Triple::nvptx64: case llvm::Triple::amdgcn: assert(getLangOpts().OpenMPIsTargetDevice && "OpenMP AMDGPU/NVPTX is only prepared to deal with device code."); OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this)); break; default: if (LangOpts.OpenMPSimd) OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this)); else OpenMPRuntime.reset(new CGOpenMPRuntime(*this)); break; } } void CodeGenModule::createCUDARuntime() { CUDARuntime.reset(CreateNVCUDARuntime(*this)); } void CodeGenModule::createHLSLRuntime() { HLSLRuntime.reset(new CGHLSLRuntime(*this)); } void CodeGenModule::addReplacement(StringRef Name, llvm::Constant *C) { Replacements[Name] = C; } void CodeGenModule::applyReplacements() { for (auto &I : Replacements) { StringRef MangledName = I.first; llvm::Constant *Replacement = I.second; llvm::GlobalValue *Entry = GetGlobalValue(MangledName); if (!Entry) continue; auto *OldF = cast(Entry); auto *NewF = dyn_cast(Replacement); if (!NewF) { if (auto *Alias = dyn_cast(Replacement)) { NewF = dyn_cast(Alias->getAliasee()); } else { auto *CE = cast(Replacement); assert(CE->getOpcode() == llvm::Instruction::BitCast || CE->getOpcode() == llvm::Instruction::GetElementPtr); NewF = dyn_cast(CE->getOperand(0)); } } // Replace old with new, but keep the old order. OldF->replaceAllUsesWith(Replacement); if (NewF) { NewF->removeFromParent(); OldF->getParent()->getFunctionList().insertAfter(OldF->getIterator(), NewF); } OldF->eraseFromParent(); } } void CodeGenModule::addGlobalValReplacement(llvm::GlobalValue *GV, llvm::Constant *C) { GlobalValReplacements.push_back(std::make_pair(GV, C)); } void CodeGenModule::applyGlobalValReplacements() { for (auto &I : GlobalValReplacements) { llvm::GlobalValue *GV = I.first; llvm::Constant *C = I.second; GV->replaceAllUsesWith(C); GV->eraseFromParent(); } } // This is only used in aliases that we created and we know they have a // linear structure. 
static const llvm::GlobalValue *getAliasedGlobal(const llvm::GlobalValue *GV) { const llvm::Constant *C; if (auto *GA = dyn_cast(GV)) C = GA->getAliasee(); else if (auto *GI = dyn_cast(GV)) C = GI->getResolver(); else return GV; const auto *AliaseeGV = dyn_cast(C->stripPointerCasts()); if (!AliaseeGV) return nullptr; const llvm::GlobalValue *FinalGV = AliaseeGV->getAliaseeObject(); if (FinalGV == GV) return nullptr; return FinalGV; } static bool checkAliasedGlobal( DiagnosticsEngine &Diags, SourceLocation Location, bool IsIFunc, const llvm::GlobalValue *Alias, const llvm::GlobalValue *&GV, const llvm::MapVector &MangledDeclNames, SourceRange AliasRange) { GV = getAliasedGlobal(Alias); if (!GV) { Diags.Report(Location, diag::err_cyclic_alias) << IsIFunc; return false; } if (GV->isDeclaration()) { Diags.Report(Location, diag::err_alias_to_undefined) << IsIFunc << IsIFunc; Diags.Report(Location, diag::note_alias_requires_mangled_name) << IsIFunc << IsIFunc; // Provide a note if the given function is not found and exists as a // mangled name. for (const auto &[Decl, Name] : MangledDeclNames) { if (const auto *ND = dyn_cast(Decl.getDecl())) { if (ND->getName() == GV->getName()) { Diags.Report(Location, diag::note_alias_mangled_name_alternative) << Name << FixItHint::CreateReplacement( AliasRange, (Twine(IsIFunc ? "ifunc" : "alias") + "(\"" + Name + "\")") .str()); } } } return false; } if (IsIFunc) { // Check resolver function type. const auto *F = dyn_cast(GV); if (!F) { Diags.Report(Location, diag::err_alias_to_undefined) << IsIFunc << IsIFunc; return false; } llvm::FunctionType *FTy = F->getFunctionType(); if (!FTy->getReturnType()->isPointerTy()) { Diags.Report(Location, diag::err_ifunc_resolver_return); return false; } } return true; } void CodeGenModule::checkAliases() { // Check if the constructed aliases are well formed. It is really unfortunate // that we have to do this in CodeGen, but we only construct mangled names // and aliases during codegen. bool Error = false; DiagnosticsEngine &Diags = getDiags(); for (const GlobalDecl &GD : Aliases) { const auto *D = cast(GD.getDecl()); SourceLocation Location; SourceRange Range; bool IsIFunc = D->hasAttr(); if (const Attr *A = D->getDefiningAttr()) { Location = A->getLocation(); Range = A->getRange(); } else llvm_unreachable("Not an alias or ifunc?"); StringRef MangledName = getMangledName(GD); llvm::GlobalValue *Alias = GetGlobalValue(MangledName); const llvm::GlobalValue *GV = nullptr; if (!checkAliasedGlobal(Diags, Location, IsIFunc, Alias, GV, MangledDeclNames, Range)) { Error = true; continue; } llvm::Constant *Aliasee = IsIFunc ? cast(Alias)->getResolver() : cast(Alias)->getAliasee(); llvm::GlobalValue *AliaseeGV; if (auto CE = dyn_cast(Aliasee)) AliaseeGV = cast(CE->getOperand(0)); else AliaseeGV = cast(Aliasee); if (const SectionAttr *SA = D->getAttr()) { StringRef AliasSection = SA->getName(); if (AliasSection != AliaseeGV->getSection()) Diags.Report(SA->getLocation(), diag::warn_alias_with_section) << AliasSection << IsIFunc << IsIFunc; } // We have to handle alias to weak aliases in here. LLVM itself disallows // this since the object semantics would not match the IL one. For // compatibility with gcc we implement it by just pointing the alias // to its aliasee's aliasee. We also warn, since the user is probably // expecting the link to be weak. 
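// Editorial example (not part of this patch) of the source pattern the block
// below handles: an alias whose target is itself a weak (interposable) alias.
// Clang warns and re-points the outer alias at the weak alias's own aliasee,
// matching GCC's behaviour.
extern "C" void impl() {}
extern "C" void weak_mid() __attribute__((weak, alias("impl")));
extern "C" void outer() __attribute__((alias("weak_mid")));  // triggers the warning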
if (auto *GA = dyn_cast(AliaseeGV)) { if (GA->isInterposable()) { Diags.Report(Location, diag::warn_alias_to_weak_alias) << GV->getName() << GA->getName() << IsIFunc; Aliasee = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( GA->getAliasee(), Alias->getType()); if (IsIFunc) cast(Alias)->setResolver(Aliasee); else cast(Alias)->setAliasee(Aliasee); } } } if (!Error) return; for (const GlobalDecl &GD : Aliases) { StringRef MangledName = getMangledName(GD); llvm::GlobalValue *Alias = GetGlobalValue(MangledName); Alias->replaceAllUsesWith(llvm::UndefValue::get(Alias->getType())); Alias->eraseFromParent(); } } void CodeGenModule::clear() { DeferredDeclsToEmit.clear(); EmittedDeferredDecls.clear(); if (OpenMPRuntime) OpenMPRuntime->clear(); } void InstrProfStats::reportDiagnostics(DiagnosticsEngine &Diags, StringRef MainFile) { if (!hasDiagnostics()) return; if (VisitedInMainFile > 0 && VisitedInMainFile == MissingInMainFile) { if (MainFile.empty()) MainFile = ""; Diags.Report(diag::warn_profile_data_unprofiled) << MainFile; } else { if (Mismatched > 0) Diags.Report(diag::warn_profile_data_out_of_date) << Visited << Mismatched; if (Missing > 0) Diags.Report(diag::warn_profile_data_missing) << Visited << Missing; } } static void setVisibilityFromDLLStorageClass(const clang::LangOptions &LO, llvm::Module &M) { if (!LO.VisibilityFromDLLStorageClass) return; llvm::GlobalValue::VisibilityTypes DLLExportVisibility = CodeGenModule::GetLLVMVisibility(LO.getDLLExportVisibility()); llvm::GlobalValue::VisibilityTypes NoDLLStorageClassVisibility = CodeGenModule::GetLLVMVisibility(LO.getNoDLLStorageClassVisibility()); llvm::GlobalValue::VisibilityTypes ExternDeclDLLImportVisibility = CodeGenModule::GetLLVMVisibility(LO.getExternDeclDLLImportVisibility()); llvm::GlobalValue::VisibilityTypes ExternDeclNoDLLStorageClassVisibility = CodeGenModule::GetLLVMVisibility( LO.getExternDeclNoDLLStorageClassVisibility()); for (llvm::GlobalValue &GV : M.global_values()) { if (GV.hasAppendingLinkage() || GV.hasLocalLinkage()) continue; // Reset DSO locality before setting the visibility. This removes // any effects that visibility options and annotations may have // had on the DSO locality. Setting the visibility will implicitly set // appropriate globals to DSO Local; however, this will be pessimistic // w.r.t. to the normal compiler IRGen. GV.setDSOLocal(false); if (GV.isDeclarationForLinker()) { GV.setVisibility(GV.getDLLStorageClass() == llvm::GlobalValue::DLLImportStorageClass ? ExternDeclDLLImportVisibility : ExternDeclNoDLLStorageClassVisibility); } else { GV.setVisibility(GV.getDLLStorageClass() == llvm::GlobalValue::DLLExportStorageClass ? 
DLLExportVisibility : NoDLLStorageClassVisibility); } GV.setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass); } } void CodeGenModule::Release() { Module *Primary = getContext().getCurrentNamedModule(); if (CXX20ModuleInits && Primary && !Primary->isHeaderLikeModule()) EmitModuleInitializers(Primary); EmitDeferred(); DeferredDecls.insert(EmittedDeferredDecls.begin(), EmittedDeferredDecls.end()); EmittedDeferredDecls.clear(); EmitVTablesOpportunistically(); applyGlobalValReplacements(); applyReplacements(); emitMultiVersionFunctions(); if (Context.getLangOpts().IncrementalExtensions && GlobalTopLevelStmtBlockInFlight.first) { const TopLevelStmtDecl *TLSD = GlobalTopLevelStmtBlockInFlight.second; GlobalTopLevelStmtBlockInFlight.first->FinishFunction(TLSD->getEndLoc()); GlobalTopLevelStmtBlockInFlight = {nullptr, nullptr}; } // Module implementations are initialized the same way as a regular TU that // imports one or more modules. if (CXX20ModuleInits && Primary && Primary->isInterfaceOrPartition()) EmitCXXModuleInitFunc(Primary); else EmitCXXGlobalInitFunc(); EmitCXXGlobalCleanUpFunc(); registerGlobalDtorsWithAtExit(); EmitCXXThreadLocalInitFunc(); if (ObjCRuntime) if (llvm::Function *ObjCInitFunction = ObjCRuntime->ModuleInitFunction()) AddGlobalCtor(ObjCInitFunction); if (Context.getLangOpts().CUDA && CUDARuntime) { if (llvm::Function *CudaCtorFunction = CUDARuntime->finalizeModule()) AddGlobalCtor(CudaCtorFunction); } if (OpenMPRuntime) { if (llvm::Function *OpenMPRequiresDirectiveRegFun = OpenMPRuntime->emitRequiresDirectiveRegFun()) { AddGlobalCtor(OpenMPRequiresDirectiveRegFun, 0); } OpenMPRuntime->createOffloadEntriesAndInfoMetadata(); OpenMPRuntime->clear(); } if (PGOReader) { getModule().setProfileSummary( PGOReader->getSummary(/* UseCS */ false).getMD(VMContext), llvm::ProfileSummary::PSK_Instr); if (PGOStats.hasDiagnostics()) PGOStats.reportDiagnostics(getDiags(), getCodeGenOpts().MainFileName); } llvm::stable_sort(GlobalCtors, [](const Structor &L, const Structor &R) { return L.LexOrder < R.LexOrder; }); EmitCtorList(GlobalCtors, "llvm.global_ctors"); EmitCtorList(GlobalDtors, "llvm.global_dtors"); EmitGlobalAnnotations(); EmitStaticExternCAliases(); checkAliases(); EmitDeferredUnusedCoverageMappings(); CodeGenPGO(*this).setValueProfilingFlag(getModule()); if (CoverageMapping) CoverageMapping->emit(); if (CodeGenOpts.SanitizeCfiCrossDso) { CodeGenFunction(*this).EmitCfiCheckFail(); CodeGenFunction(*this).EmitCfiCheckStub(); } if (LangOpts.Sanitize.has(SanitizerKind::KCFI)) finalizeKCFITypes(); emitAtAvailableLinkGuard(); if (Context.getTargetInfo().getTriple().isWasm()) EmitMainVoidAlias(); if (getTriple().isAMDGPU()) { // Emit amdgpu_code_object_version module flag, which is code object version // times 100. if (getTarget().getTargetOpts().CodeObjectVersion != TargetOptions::COV_None) { getModule().addModuleFlag(llvm::Module::Error, "amdgpu_code_object_version", getTarget().getTargetOpts().CodeObjectVersion); } // Currently, "-mprintf-kind" option is only supported for HIP if (LangOpts.HIP) { auto *MDStr = llvm::MDString::get( getLLVMContext(), (getTarget().getTargetOpts().AMDGPUPrintfKindVal == TargetOptions::AMDGPUPrintfKind::Hostcall) ? "hostcall" : "buffered"); getModule().addModuleFlag(llvm::Module::Error, "amdgpu_printf_kind", MDStr); } } // Emit a global array containing all external kernels or device variables // used by host functions and mark it as used for CUDA/HIP. 
This is necessary // to get kernels or device variables in archives linked in even if these // kernels or device variables are only used in host functions. if (!Context.CUDAExternalDeviceDeclODRUsedByHost.empty()) { SmallVector UsedArray; for (auto D : Context.CUDAExternalDeviceDeclODRUsedByHost) { GlobalDecl GD; if (auto *FD = dyn_cast(D)) GD = GlobalDecl(FD, KernelReferenceKind::Kernel); else GD = GlobalDecl(D); UsedArray.push_back(llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( GetAddrOfGlobal(GD), Int8PtrTy)); } llvm::ArrayType *ATy = llvm::ArrayType::get(Int8PtrTy, UsedArray.size()); auto *GV = new llvm::GlobalVariable( getModule(), ATy, false, llvm::GlobalValue::InternalLinkage, llvm::ConstantArray::get(ATy, UsedArray), "__clang_gpu_used_external"); addCompilerUsedGlobal(GV); } emitLLVMUsed(); if (SanStats) SanStats->finish(); if (CodeGenOpts.Autolink && (Context.getLangOpts().Modules || !LinkerOptionsMetadata.empty())) { EmitModuleLinkOptions(); } // On ELF we pass the dependent library specifiers directly to the linker // without manipulating them. This is in contrast to other platforms where // they are mapped to a specific linker option by the compiler. This // difference is a result of the greater variety of ELF linkers and the fact // that ELF linkers tend to handle libraries in a more complicated fashion // than on other platforms. This forces us to defer handling the dependent // libs to the linker. // // CUDA/HIP device and host libraries are different. Currently there is no // way to differentiate dependent libraries for host or device. Existing // usage of #pragma comment(lib, *) is intended for host libraries on // Windows. Therefore emit llvm.dependent-libraries only for host. if (!ELFDependentLibraries.empty() && !Context.getLangOpts().CUDAIsDevice) { auto *NMD = getModule().getOrInsertNamedMetadata("llvm.dependent-libraries"); for (auto *MD : ELFDependentLibraries) NMD->addOperand(MD); } // Record mregparm value now so it is visible through rest of codegen. if (Context.getTargetInfo().getTriple().getArch() == llvm::Triple::x86) getModule().addModuleFlag(llvm::Module::Error, "NumRegisterParameters", CodeGenOpts.NumRegisterParameters); if (CodeGenOpts.DwarfVersion) { getModule().addModuleFlag(llvm::Module::Max, "Dwarf Version", CodeGenOpts.DwarfVersion); } if (CodeGenOpts.Dwarf64) getModule().addModuleFlag(llvm::Module::Max, "DWARF64", 1); if (Context.getLangOpts().SemanticInterposition) // Require various optimization to respect semantic interposition. getModule().setSemanticInterposition(true); if (CodeGenOpts.EmitCodeView) { // Indicate that we want CodeView in the metadata. getModule().addModuleFlag(llvm::Module::Warning, "CodeView", 1); } if (CodeGenOpts.CodeViewGHash) { getModule().addModuleFlag(llvm::Module::Warning, "CodeViewGHash", 1); } if (CodeGenOpts.ControlFlowGuard) { // Function ID tables and checks for Control Flow Guard (cfguard=2). getModule().addModuleFlag(llvm::Module::Warning, "cfguard", 2); } else if (CodeGenOpts.ControlFlowGuardNoChecks) { // Function ID tables for Control Flow Guard (cfguard=1). getModule().addModuleFlag(llvm::Module::Warning, "cfguard", 1); } if (CodeGenOpts.EHContGuard) { // Function ID tables for EH Continuation Guard. getModule().addModuleFlag(llvm::Module::Warning, "ehcontguard", 1); } if (Context.getLangOpts().Kernel) { // Note if we are compiling with /kernel. 
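// Editorial note (not part of this patch): the module-flag behaviors used above
// and below in Release() differ in how flags merge when modules are linked
// (e.g. under LTO):
//   Error    - mismatched values are a hard error (e.g. "wchar_size").
//   Warning  - mismatches only warn (e.g. "CodeView", "cfguard").
//   Max/Min  - the larger/smaller value wins (e.g. "Dwarf Version" uses Max;
//              the ARM/AArch64 branch-protection flags use Min so the feature
//              ends up off if any linked object lacks it).
//   Override - this module's value wins unconditionally (e.g. "kcfi").
//   Require  - asserts that another named flag is present with a given value
//              (e.g. "StrictVTablePointersRequirement").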
getModule().addModuleFlag(llvm::Module::Warning, "ms-kernel", 1); } if (CodeGenOpts.OptimizationLevel > 0 && CodeGenOpts.StrictVTablePointers) { // We don't support LTO with 2 with different StrictVTablePointers // FIXME: we could support it by stripping all the information introduced // by StrictVTablePointers. getModule().addModuleFlag(llvm::Module::Error, "StrictVTablePointers",1); llvm::Metadata *Ops[2] = { llvm::MDString::get(VMContext, "StrictVTablePointers"), llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( llvm::Type::getInt32Ty(VMContext), 1))}; getModule().addModuleFlag(llvm::Module::Require, "StrictVTablePointersRequirement", llvm::MDNode::get(VMContext, Ops)); } if (getModuleDebugInfo()) // We support a single version in the linked module. The LLVM // parser will drop debug info with a different version number // (and warn about it, too). getModule().addModuleFlag(llvm::Module::Warning, "Debug Info Version", llvm::DEBUG_METADATA_VERSION); // We need to record the widths of enums and wchar_t, so that we can generate // the correct build attributes in the ARM backend. wchar_size is also used by // TargetLibraryInfo. uint64_t WCharWidth = Context.getTypeSizeInChars(Context.getWideCharType()).getQuantity(); getModule().addModuleFlag(llvm::Module::Error, "wchar_size", WCharWidth); llvm::Triple::ArchType Arch = Context.getTargetInfo().getTriple().getArch(); if ( Arch == llvm::Triple::arm || Arch == llvm::Triple::armeb || Arch == llvm::Triple::thumb || Arch == llvm::Triple::thumbeb) { // The minimum width of an enum in bytes uint64_t EnumWidth = Context.getLangOpts().ShortEnums ? 1 : 4; getModule().addModuleFlag(llvm::Module::Error, "min_enum_size", EnumWidth); } if (Arch == llvm::Triple::riscv32 || Arch == llvm::Triple::riscv64) { StringRef ABIStr = Target.getABI(); llvm::LLVMContext &Ctx = TheModule.getContext(); getModule().addModuleFlag(llvm::Module::Error, "target-abi", llvm::MDString::get(Ctx, ABIStr)); } if (CodeGenOpts.SanitizeCfiCrossDso) { // Indicate that we want cross-DSO control flow integrity checks. getModule().addModuleFlag(llvm::Module::Override, "Cross-DSO CFI", 1); } if (CodeGenOpts.WholeProgramVTables) { // Indicate whether VFE was enabled for this module, so that the // vcall_visibility metadata added under whole program vtables is handled // appropriately in the optimizer. getModule().addModuleFlag(llvm::Module::Error, "Virtual Function Elim", CodeGenOpts.VirtualFunctionElimination); } if (LangOpts.Sanitize.has(SanitizerKind::CFIICall)) { getModule().addModuleFlag(llvm::Module::Override, "CFI Canonical Jump Tables", CodeGenOpts.SanitizeCfiCanonicalJumpTables); } if (LangOpts.Sanitize.has(SanitizerKind::KCFI)) { getModule().addModuleFlag(llvm::Module::Override, "kcfi", 1); // KCFI assumes patchable-function-prefix is the same for all indirectly // called functions. Store the expected offset for code generation. if (CodeGenOpts.PatchableFunctionEntryOffset) getModule().addModuleFlag(llvm::Module::Override, "kcfi-offset", CodeGenOpts.PatchableFunctionEntryOffset); } if (CodeGenOpts.CFProtectionReturn && Target.checkCFProtectionReturnSupported(getDiags())) { // Indicate that we want to instrument return control flow protection. getModule().addModuleFlag(llvm::Module::Min, "cf-protection-return", 1); } if (CodeGenOpts.CFProtectionBranch && Target.checkCFProtectionBranchSupported(getDiags())) { // Indicate that we want to instrument branch control flow protection. 
getModule().addModuleFlag(llvm::Module::Min, "cf-protection-branch", 1); } if (CodeGenOpts.FunctionReturnThunks) getModule().addModuleFlag(llvm::Module::Override, "function_return_thunk_extern", 1); if (CodeGenOpts.IndirectBranchCSPrefix) getModule().addModuleFlag(llvm::Module::Override, "indirect_branch_cs_prefix", 1); // Add module metadata for return address signing (ignoring // non-leaf/all) and stack tagging. These are actually turned on by function // attributes, but we use module metadata to emit build attributes. This is // needed for LTO, where the function attributes are inside bitcode // serialised into a global variable by the time build attributes are // emitted, so we can't access them. LTO objects could be compiled with // different flags therefore module flags are set to "Min" behavior to achieve // the same end result of the normal build where e.g BTI is off if any object // doesn't support it. if (Context.getTargetInfo().hasFeature("ptrauth") && LangOpts.getSignReturnAddressScope() != LangOptions::SignReturnAddressScopeKind::None) getModule().addModuleFlag(llvm::Module::Override, "sign-return-address-buildattr", 1); if (LangOpts.Sanitize.has(SanitizerKind::MemtagStack)) getModule().addModuleFlag(llvm::Module::Override, "tag-stack-memory-buildattr", 1); if (Arch == llvm::Triple::thumb || Arch == llvm::Triple::thumbeb || Arch == llvm::Triple::arm || Arch == llvm::Triple::armeb || Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_32 || Arch == llvm::Triple::aarch64_be) { if (LangOpts.BranchTargetEnforcement) getModule().addModuleFlag(llvm::Module::Min, "branch-target-enforcement", 1); if (LangOpts.hasSignReturnAddress()) getModule().addModuleFlag(llvm::Module::Min, "sign-return-address", 1); if (LangOpts.isSignReturnAddressScopeAll()) getModule().addModuleFlag(llvm::Module::Min, "sign-return-address-all", 1); if (!LangOpts.isSignReturnAddressWithAKey()) getModule().addModuleFlag(llvm::Module::Min, "sign-return-address-with-bkey", 1); } if (!CodeGenOpts.MemoryProfileOutput.empty()) { llvm::LLVMContext &Ctx = TheModule.getContext(); getModule().addModuleFlag( llvm::Module::Error, "MemProfProfileFilename", llvm::MDString::get(Ctx, CodeGenOpts.MemoryProfileOutput)); } if (LangOpts.CUDAIsDevice && getTriple().isNVPTX()) { // Indicate whether __nvvm_reflect should be configured to flush denormal // floating point values to 0. (This corresponds to its "__CUDA_FTZ" // property.) getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", CodeGenOpts.FP32DenormalMode.Output != llvm::DenormalMode::IEEE); } if (LangOpts.EHAsynch) getModule().addModuleFlag(llvm::Module::Warning, "eh-asynch", 1); // Indicate whether this Module was compiled with -fopenmp if (getLangOpts().OpenMP && !getLangOpts().OpenMPSimd) getModule().addModuleFlag(llvm::Module::Max, "openmp", LangOpts.OpenMP); if (getLangOpts().OpenMPIsTargetDevice) getModule().addModuleFlag(llvm::Module::Max, "openmp-device", LangOpts.OpenMP); // Emit OpenCL specific module metadata: OpenCL/SPIR version. if (LangOpts.OpenCL || (LangOpts.CUDAIsDevice && getTriple().isSPIRV())) { EmitOpenCLMetadata(); // Emit SPIR version. if (getTriple().isSPIR()) { // SPIR v2.0 s2.12 - The SPIR version used by the module is stored in the // opencl.spir.version named metadata. // C++ for OpenCL has a distinct mapping for version compatibility with // OpenCL. 
auto Version = LangOpts.getOpenCLCompatibleVersion(); llvm::Metadata *SPIRVerElts[] = { llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( Int32Ty, Version / 100)), llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( Int32Ty, (Version / 100 > 1) ? 0 : 2))}; llvm::NamedMDNode *SPIRVerMD = TheModule.getOrInsertNamedMetadata("opencl.spir.version"); llvm::LLVMContext &Ctx = TheModule.getContext(); SPIRVerMD->addOperand(llvm::MDNode::get(Ctx, SPIRVerElts)); } } // HLSL related end of code gen work items. if (LangOpts.HLSL) getHLSLRuntime().finishCodeGen(); if (uint32_t PLevel = Context.getLangOpts().PICLevel) { assert(PLevel < 3 && "Invalid PIC Level"); getModule().setPICLevel(static_cast(PLevel)); if (Context.getLangOpts().PIE) getModule().setPIELevel(static_cast(PLevel)); } if (getCodeGenOpts().CodeModel.size() > 0) { unsigned CM = llvm::StringSwitch(getCodeGenOpts().CodeModel) .Case("tiny", llvm::CodeModel::Tiny) .Case("small", llvm::CodeModel::Small) .Case("kernel", llvm::CodeModel::Kernel) .Case("medium", llvm::CodeModel::Medium) .Case("large", llvm::CodeModel::Large) .Default(~0u); if (CM != ~0u) { llvm::CodeModel::Model codeModel = static_cast(CM); getModule().setCodeModel(codeModel); } } if (CodeGenOpts.NoPLT) getModule().setRtLibUseGOT(); if (getTriple().isOSBinFormatELF() && CodeGenOpts.DirectAccessExternalData != getModule().getDirectAccessExternalData()) { getModule().setDirectAccessExternalData( CodeGenOpts.DirectAccessExternalData); } if (CodeGenOpts.UnwindTables) getModule().setUwtable(llvm::UWTableKind(CodeGenOpts.UnwindTables)); switch (CodeGenOpts.getFramePointer()) { case CodeGenOptions::FramePointerKind::None: // 0 ("none") is the default. break; case CodeGenOptions::FramePointerKind::NonLeaf: getModule().setFramePointer(llvm::FramePointerKind::NonLeaf); break; case CodeGenOptions::FramePointerKind::All: getModule().setFramePointer(llvm::FramePointerKind::All); break; } SimplifyPersonality(); if (getCodeGenOpts().EmitDeclMetadata) EmitDeclMetadata(); if (getCodeGenOpts().CoverageNotesFile.size() || getCodeGenOpts().CoverageDataFile.size()) EmitCoverageFile(); if (CGDebugInfo *DI = getModuleDebugInfo()) DI->finalize(); if (getCodeGenOpts().EmitVersionIdentMetadata) EmitVersionIdentMetadata(); if (!getCodeGenOpts().RecordCommandLine.empty()) EmitCommandLineMetadata(); if (!getCodeGenOpts().StackProtectorGuard.empty()) getModule().setStackProtectorGuard(getCodeGenOpts().StackProtectorGuard); if (!getCodeGenOpts().StackProtectorGuardReg.empty()) getModule().setStackProtectorGuardReg( getCodeGenOpts().StackProtectorGuardReg); if (!getCodeGenOpts().StackProtectorGuardSymbol.empty()) getModule().setStackProtectorGuardSymbol( getCodeGenOpts().StackProtectorGuardSymbol); if (getCodeGenOpts().StackProtectorGuardOffset != INT_MAX) getModule().setStackProtectorGuardOffset( getCodeGenOpts().StackProtectorGuardOffset); if (getCodeGenOpts().StackAlignment) getModule().setOverrideStackAlignment(getCodeGenOpts().StackAlignment); if (getCodeGenOpts().SkipRaxSetup) getModule().addModuleFlag(llvm::Module::Override, "SkipRaxSetup", 1); if (getContext().getTargetInfo().getMaxTLSAlign()) getModule().addModuleFlag(llvm::Module::Error, "MaxTLSAlign", getContext().getTargetInfo().getMaxTLSAlign()); getTargetCodeGenInfo().emitTargetMetadata(*this, MangledDeclNames); EmitBackendOptionsMetadata(getCodeGenOpts()); // If there is device offloading code embed it in the host now. 
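// Editorial worked example (not part of this patch) for the version encoding
// used in the SPIR metadata above and in EmitOpenCLMetadata() below, assuming
// getOpenCLCompatibleVersion() returns major*100 + minor*10 (120 for OpenCL C
// 1.2, 300 for OpenCL C 3.0):
//   opencl.spir.version = {Version / 100, Version / 100 > 1 ? 0 : 2}
//                         -> {1, 2} for 1.x, {2, 0} / {3, 0} for 2.0 / 3.0
//   opencl.ocl.version  = {Version / 100, (Version % 100) / 10}
//                         -> {1, 2} for 1.2, {2, 0} for 2.0, {3, 0} for 3.0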
  EmbedObject(&getModule(), CodeGenOpts, getDiags());

  // Set visibility from DLL storage class
  // We do this at the end of LLVM IR generation; after any operation
  // that might affect the DLL storage class or the visibility, and
  // before anything that might act on these.
  setVisibilityFromDLLStorageClass(LangOpts, getModule());
}

void CodeGenModule::EmitOpenCLMetadata() {
  // SPIR v2.0 s2.13 - The OpenCL version used by the module is stored in the
  // opencl.ocl.version named metadata node.
  // C++ for OpenCL has a distinct mapping for versions compatible with OpenCL.
  auto Version = LangOpts.getOpenCLCompatibleVersion();
  llvm::Metadata *OCLVerElts[] = {
      llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
          Int32Ty, Version / 100)),
      llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
          Int32Ty, (Version % 100) / 10))};
  llvm::NamedMDNode *OCLVerMD =
      TheModule.getOrInsertNamedMetadata("opencl.ocl.version");
  llvm::LLVMContext &Ctx = TheModule.getContext();
  OCLVerMD->addOperand(llvm::MDNode::get(Ctx, OCLVerElts));
}

void CodeGenModule::EmitBackendOptionsMetadata(
    const CodeGenOptions &CodeGenOpts) {
  if (getTriple().isRISCV()) {
    getModule().addModuleFlag(llvm::Module::Min, "SmallDataLimit",
                              CodeGenOpts.SmallDataLimit);
  }
}

void CodeGenModule::UpdateCompletedType(const TagDecl *TD) {
  // Make sure that this type is translated.
  Types.UpdateCompletedType(TD);
}

void CodeGenModule::RefreshTypeCacheForClass(const CXXRecordDecl *RD) {
  // Make sure that this type is translated.
  Types.RefreshTypeCacheForClass(RD);
}

llvm::MDNode *CodeGenModule::getTBAATypeInfo(QualType QTy) {
  if (!TBAA)
    return nullptr;
  return TBAA->getTypeInfo(QTy);
}

TBAAAccessInfo CodeGenModule::getTBAAAccessInfo(QualType AccessType) {
  if (!TBAA)
    return TBAAAccessInfo();
  if (getLangOpts().CUDAIsDevice) {
    // As CUDA builtin surface/texture types are replaced, skip generating TBAA
    // access info.
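    // (Gloss, assumption based on the comment above: the builtin surface and
    // texture types are the texture<...>/surface<...> class templates from the
    // CUDA headers, which target codegen rewrites to opaque handle types, so
    // their TBAA info would describe a type that no longer exists in the IR.)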
if (AccessType->isCUDADeviceBuiltinSurfaceType()) { if (getTargetCodeGenInfo().getCUDADeviceBuiltinSurfaceDeviceType() != nullptr) return TBAAAccessInfo(); } else if (AccessType->isCUDADeviceBuiltinTextureType()) { if (getTargetCodeGenInfo().getCUDADeviceBuiltinTextureDeviceType() != nullptr) return TBAAAccessInfo(); } } return TBAA->getAccessInfo(AccessType); } TBAAAccessInfo CodeGenModule::getTBAAVTablePtrAccessInfo(llvm::Type *VTablePtrType) { if (!TBAA) return TBAAAccessInfo(); return TBAA->getVTablePtrAccessInfo(VTablePtrType); } llvm::MDNode *CodeGenModule::getTBAAStructInfo(QualType QTy) { if (!TBAA) return nullptr; return TBAA->getTBAAStructInfo(QTy); } llvm::MDNode *CodeGenModule::getTBAABaseTypeInfo(QualType QTy) { if (!TBAA) return nullptr; return TBAA->getBaseTypeInfo(QTy); } llvm::MDNode *CodeGenModule::getTBAAAccessTagInfo(TBAAAccessInfo Info) { if (!TBAA) return nullptr; return TBAA->getAccessTagInfo(Info); } TBAAAccessInfo CodeGenModule::mergeTBAAInfoForCast(TBAAAccessInfo SourceInfo, TBAAAccessInfo TargetInfo) { if (!TBAA) return TBAAAccessInfo(); return TBAA->mergeTBAAInfoForCast(SourceInfo, TargetInfo); } TBAAAccessInfo CodeGenModule::mergeTBAAInfoForConditionalOperator(TBAAAccessInfo InfoA, TBAAAccessInfo InfoB) { if (!TBAA) return TBAAAccessInfo(); return TBAA->mergeTBAAInfoForConditionalOperator(InfoA, InfoB); } TBAAAccessInfo CodeGenModule::mergeTBAAInfoForMemoryTransfer(TBAAAccessInfo DestInfo, TBAAAccessInfo SrcInfo) { if (!TBAA) return TBAAAccessInfo(); return TBAA->mergeTBAAInfoForConditionalOperator(DestInfo, SrcInfo); } void CodeGenModule::DecorateInstructionWithTBAA(llvm::Instruction *Inst, TBAAAccessInfo TBAAInfo) { if (llvm::MDNode *Tag = getTBAAAccessTagInfo(TBAAInfo)) Inst->setMetadata(llvm::LLVMContext::MD_tbaa, Tag); } void CodeGenModule::DecorateInstructionWithInvariantGroup( llvm::Instruction *I, const CXXRecordDecl *RD) { I->setMetadata(llvm::LLVMContext::MD_invariant_group, llvm::MDNode::get(getLLVMContext(), {})); } void CodeGenModule::Error(SourceLocation loc, StringRef message) { unsigned diagID = getDiags().getCustomDiagID(DiagnosticsEngine::Error, "%0"); getDiags().Report(Context.getFullLoc(loc), diagID) << message; } /// ErrorUnsupported - Print out an error that codegen doesn't support the /// specified stmt yet. void CodeGenModule::ErrorUnsupported(const Stmt *S, const char *Type) { unsigned DiagID = getDiags().getCustomDiagID(DiagnosticsEngine::Error, "cannot compile this %0 yet"); std::string Msg = Type; getDiags().Report(Context.getFullLoc(S->getBeginLoc()), DiagID) << Msg << S->getSourceRange(); } /// ErrorUnsupported - Print out an error that codegen doesn't support the /// specified decl yet. void CodeGenModule::ErrorUnsupported(const Decl *D, const char *Type) { unsigned DiagID = getDiags().getCustomDiagID(DiagnosticsEngine::Error, "cannot compile this %0 yet"); std::string Msg = Type; getDiags().Report(Context.getFullLoc(D->getLocation()), DiagID) << Msg; } llvm::ConstantInt *CodeGenModule::getSize(CharUnits size) { return llvm::ConstantInt::get(SizeTy, size.getQuantity()); } void CodeGenModule::setGlobalVisibility(llvm::GlobalValue *GV, const NamedDecl *D) const { // Internal definitions always have default visibility. if (GV->hasLocalLinkage()) { GV->setVisibility(llvm::GlobalValue::DefaultVisibility); return; } if (!D) return; // Set visibility for definitions, and for declarations if requested globally // or set explicitly. 
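  // Illustrative example (not from the source): a declaration such as
  //   __declspec(dllexport) __attribute__((visibility("hidden"))) void f();
  // is diagnosed below, since a dllexported symbol must have default
  // visibility.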
LinkageInfo LV = D->getLinkageAndVisibility(); if (GV->hasDLLExportStorageClass() || GV->hasDLLImportStorageClass()) { // Reject incompatible dlllstorage and visibility annotations. if (!LV.isVisibilityExplicit()) return; if (GV->hasDLLExportStorageClass()) { if (LV.getVisibility() == HiddenVisibility) getDiags().Report(D->getLocation(), diag::err_hidden_visibility_dllexport); } else if (LV.getVisibility() != DefaultVisibility) { getDiags().Report(D->getLocation(), diag::err_non_default_visibility_dllimport); } return; } if (LV.isVisibilityExplicit() || getLangOpts().SetVisibilityForExternDecls || !GV->isDeclarationForLinker()) GV->setVisibility(GetLLVMVisibility(LV.getVisibility())); } static bool shouldAssumeDSOLocal(const CodeGenModule &CGM, llvm::GlobalValue *GV) { if (GV->hasLocalLinkage()) return true; if (!GV->hasDefaultVisibility() && !GV->hasExternalWeakLinkage()) return true; // DLLImport explicitly marks the GV as external. if (GV->hasDLLImportStorageClass()) return false; const llvm::Triple &TT = CGM.getTriple(); if (TT.isWindowsGNUEnvironment()) { // In MinGW, variables without DLLImport can still be automatically // imported from a DLL by the linker; don't mark variables that // potentially could come from another DLL as DSO local. // With EmulatedTLS, TLS variables can be autoimported from other DLLs // (and this actually happens in the public interface of libstdc++), so // such variables can't be marked as DSO local. (Native TLS variables // can't be dllimported at all, though.) if (GV->isDeclarationForLinker() && isa(GV) && (!GV->isThreadLocal() || CGM.getCodeGenOpts().EmulatedTLS)) return false; } // On COFF, don't mark 'extern_weak' symbols as DSO local. If these symbols // remain unresolved in the link, they can be resolved to zero, which is // outside the current DSO. if (TT.isOSBinFormatCOFF() && GV->hasExternalWeakLinkage()) return false; // Every other GV is local on COFF. // Make an exception for windows OS in the triple: Some firmware builds use // *-win32-macho triples. This (accidentally?) produced windows relocations // without GOT tables in older clang versions; Keep this behaviour. // FIXME: even thread local variables? if (TT.isOSBinFormatCOFF() || (TT.isOSWindows() && TT.isOSBinFormatMachO())) return true; // Only handle COFF and ELF for now. if (!TT.isOSBinFormatELF()) return false; // If this is not an executable, don't assume anything is local. const auto &CGOpts = CGM.getCodeGenOpts(); llvm::Reloc::Model RM = CGOpts.RelocationModel; const auto &LOpts = CGM.getLangOpts(); if (RM != llvm::Reloc::Static && !LOpts.PIE) { // On ELF, if -fno-semantic-interposition is specified and the target // supports local aliases, there will be neither CC1 // -fsemantic-interposition nor -fhalf-no-semantic-interposition. Set // dso_local on the function if using a local alias is preferable (can avoid // PLT indirection). if (!(isa(GV) && GV->canBenefitFromLocalAlias())) return false; return !(CGM.getLangOpts().SemanticInterposition || CGM.getLangOpts().HalfNoSemanticInterposition); } // A definition cannot be preempted from an executable. if (!GV->isDeclarationForLinker()) return true; // Most PIC code sequences that assume that a symbol is local cannot produce a // 0 if it turns out the symbol is undefined. While this is ABI and relocation // depended, it seems worth it to handle it here. if (RM == llvm::Reloc::PIC_ && GV->hasExternalWeakLinkage()) return false; // PowerPC64 prefers TOC indirection to avoid copy relocations. 
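  // (Illustrative: under -fpie or -fno-pic the declarations that reach this
  // point would otherwise become dso_local and be accessed directly, which
  // forces copy relocations for external data; PPC64 instead keeps them
  // non-dso_local and reaches them through the TOC.)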
if (TT.isPPC64()) return false; if (CGOpts.DirectAccessExternalData) { // If -fdirect-access-external-data (default for -fno-pic), set dso_local // for non-thread-local variables. If the symbol is not defined in the // executable, a copy relocation will be needed at link time. dso_local is // excluded for thread-local variables because they generally don't support // copy relocations. if (auto *Var = dyn_cast(GV)) if (!Var->isThreadLocal()) return true; // -fno-pic sets dso_local on a function declaration to allow direct // accesses when taking its address (similar to a data symbol). If the // function is not defined in the executable, a canonical PLT entry will be // needed at link time. -fno-direct-access-external-data can avoid the // canonical PLT entry. We don't generalize this condition to -fpie/-fpic as // it could just cause trouble without providing perceptible benefits. if (isa(GV) && !CGOpts.NoPLT && RM == llvm::Reloc::Static) return true; } // If we can use copy relocations we can assume it is local. // Otherwise don't assume it is local. return false; } void CodeGenModule::setDSOLocal(llvm::GlobalValue *GV) const { GV->setDSOLocal(shouldAssumeDSOLocal(*this, GV)); } void CodeGenModule::setDLLImportDLLExport(llvm::GlobalValue *GV, GlobalDecl GD) const { const auto *D = dyn_cast(GD.getDecl()); // C++ destructors have a few C++ ABI specific special cases. if (const auto *Dtor = dyn_cast_or_null(D)) { getCXXABI().setCXXDestructorDLLStorage(GV, Dtor, GD.getDtorType()); return; } setDLLImportDLLExport(GV, D); } void CodeGenModule::setDLLImportDLLExport(llvm::GlobalValue *GV, const NamedDecl *D) const { if (D && D->isExternallyVisible()) { if (D->hasAttr()) GV->setDLLStorageClass(llvm::GlobalVariable::DLLImportStorageClass); else if ((D->hasAttr() || shouldMapVisibilityToDLLExport(D)) && !GV->isDeclarationForLinker()) GV->setDLLStorageClass(llvm::GlobalVariable::DLLExportStorageClass); } } void CodeGenModule::setGVProperties(llvm::GlobalValue *GV, GlobalDecl GD) const { setDLLImportDLLExport(GV, GD); setGVPropertiesAux(GV, dyn_cast(GD.getDecl())); } void CodeGenModule::setGVProperties(llvm::GlobalValue *GV, const NamedDecl *D) const { setDLLImportDLLExport(GV, D); setGVPropertiesAux(GV, D); } void CodeGenModule::setGVPropertiesAux(llvm::GlobalValue *GV, const NamedDecl *D) const { setGlobalVisibility(GV, D); setDSOLocal(GV); GV->setPartition(CodeGenOpts.SymbolPartition); } static llvm::GlobalVariable::ThreadLocalMode GetLLVMTLSModel(StringRef S) { return llvm::StringSwitch(S) .Case("global-dynamic", llvm::GlobalVariable::GeneralDynamicTLSModel) .Case("local-dynamic", llvm::GlobalVariable::LocalDynamicTLSModel) .Case("initial-exec", llvm::GlobalVariable::InitialExecTLSModel) .Case("local-exec", llvm::GlobalVariable::LocalExecTLSModel); } llvm::GlobalVariable::ThreadLocalMode CodeGenModule::GetDefaultLLVMTLSModel() const { switch (CodeGenOpts.getDefaultTLSModel()) { case CodeGenOptions::GeneralDynamicTLSModel: return llvm::GlobalVariable::GeneralDynamicTLSModel; case CodeGenOptions::LocalDynamicTLSModel: return llvm::GlobalVariable::LocalDynamicTLSModel; case CodeGenOptions::InitialExecTLSModel: return llvm::GlobalVariable::InitialExecTLSModel; case CodeGenOptions::LocalExecTLSModel: return llvm::GlobalVariable::LocalExecTLSModel; } llvm_unreachable("Invalid TLS model!"); } void CodeGenModule::setTLSMode(llvm::GlobalValue *GV, const VarDecl &D) const { assert(D.getTLSKind() && "setting TLS mode on non-TLS var!"); llvm::GlobalValue::ThreadLocalMode TLM; TLM = GetDefaultLLVMTLSModel(); 
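  // Illustrative example (not from the source):
  //   static __thread int Counter __attribute__((tls_model("initial-exec")));
  // takes the override branch below and is emitted with the initialexec TLS
  // model instead of the -ftls-model default.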
// Override the TLS model if it is explicitly specified. if (const TLSModelAttr *Attr = D.getAttr()) { TLM = GetLLVMTLSModel(Attr->getModel()); } GV->setThreadLocalMode(TLM); } static std::string getCPUSpecificMangling(const CodeGenModule &CGM, StringRef Name) { const TargetInfo &Target = CGM.getTarget(); return (Twine('.') + Twine(Target.CPUSpecificManglingCharacter(Name))).str(); } static void AppendCPUSpecificCPUDispatchMangling(const CodeGenModule &CGM, const CPUSpecificAttr *Attr, unsigned CPUIndex, raw_ostream &Out) { // cpu_specific gets the current name, dispatch gets the resolver if IFunc is // supported. if (Attr) Out << getCPUSpecificMangling(CGM, Attr->getCPUName(CPUIndex)->getName()); else if (CGM.getTarget().supportsIFunc()) Out << ".resolver"; } static void AppendTargetVersionMangling(const CodeGenModule &CGM, const TargetVersionAttr *Attr, raw_ostream &Out) { if (Attr->isDefaultVersion()) return; Out << "._"; const TargetInfo &TI = CGM.getTarget(); llvm::SmallVector Feats; Attr->getFeatures(Feats); llvm::stable_sort(Feats, [&TI](const StringRef FeatL, const StringRef FeatR) { return TI.multiVersionSortPriority(FeatL) < TI.multiVersionSortPriority(FeatR); }); for (const auto &Feat : Feats) { Out << 'M'; Out << Feat; } } static void AppendTargetMangling(const CodeGenModule &CGM, const TargetAttr *Attr, raw_ostream &Out) { if (Attr->isDefaultVersion()) return; Out << '.'; const TargetInfo &Target = CGM.getTarget(); ParsedTargetAttr Info = Target.parseTargetAttr(Attr->getFeaturesStr()); llvm::sort(Info.Features, [&Target](StringRef LHS, StringRef RHS) { // Multiversioning doesn't allow "no-${feature}", so we can // only have "+" prefixes here. assert(LHS.startswith("+") && RHS.startswith("+") && "Features should always have a prefix."); return Target.multiVersionSortPriority(LHS.substr(1)) > Target.multiVersionSortPriority(RHS.substr(1)); }); bool IsFirst = true; if (!Info.CPU.empty()) { IsFirst = false; Out << "arch_" << Info.CPU; } for (StringRef Feat : Info.Features) { if (!IsFirst) Out << '_'; IsFirst = false; Out << Feat.substr(1); } } // Returns true if GD is a function decl with internal linkage and // needs a unique suffix after the mangled name. static bool isUniqueInternalLinkageDecl(GlobalDecl GD, CodeGenModule &CGM) { const Decl *D = GD.getDecl(); return !CGM.getModuleNameHash().empty() && isa(D) && (CGM.getFunctionLinkage(GD) == llvm::GlobalValue::InternalLinkage); } static void AppendTargetClonesMangling(const CodeGenModule &CGM, const TargetClonesAttr *Attr, unsigned VersionIndex, raw_ostream &Out) { const TargetInfo &TI = CGM.getTarget(); if (TI.getTriple().isAArch64()) { StringRef FeatureStr = Attr->getFeatureStr(VersionIndex); if (FeatureStr == "default") return; Out << "._"; SmallVector Features; FeatureStr.split(Features, "+"); llvm::stable_sort(Features, [&TI](const StringRef FeatL, const StringRef FeatR) { return TI.multiVersionSortPriority(FeatL) < TI.multiVersionSortPriority(FeatR); }); for (auto &Feat : Features) { Out << 'M'; Out << Feat; } } else { Out << '.'; StringRef FeatureStr = Attr->getFeatureStr(VersionIndex); if (FeatureStr.startswith("arch=")) Out << "arch_" << FeatureStr.substr(sizeof("arch=") - 1); else Out << FeatureStr; Out << '.' 
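    // (Illustrative: on x86 a target_clones entry such as "arch=ivybridge" at
    // mangled index 1 produces a suffix of the form ".arch_ivybridge.1" here.)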
<< Attr->getMangledIndex(VersionIndex); } } static std::string getMangledNameImpl(CodeGenModule &CGM, GlobalDecl GD, const NamedDecl *ND, bool OmitMultiVersionMangling = false) { SmallString<256> Buffer; llvm::raw_svector_ostream Out(Buffer); MangleContext &MC = CGM.getCXXABI().getMangleContext(); if (!CGM.getModuleNameHash().empty()) MC.needsUniqueInternalLinkageNames(); bool ShouldMangle = MC.shouldMangleDeclName(ND); if (ShouldMangle) MC.mangleName(GD.getWithDecl(ND), Out); else { IdentifierInfo *II = ND->getIdentifier(); assert(II && "Attempt to mangle unnamed decl."); const auto *FD = dyn_cast(ND); if (FD && FD->getType()->castAs()->getCallConv() == CC_X86RegCall) { Out << "__regcall3__" << II->getName(); } else if (FD && FD->hasAttr() && GD.getKernelReferenceKind() == KernelReferenceKind::Stub) { Out << "__device_stub__" << II->getName(); } else { Out << II->getName(); } } // Check if the module name hash should be appended for internal linkage // symbols. This should come before multi-version target suffixes are // appended. This is to keep the name and module hash suffix of the // internal linkage function together. The unique suffix should only be // added when name mangling is done to make sure that the final name can // be properly demangled. For example, for C functions without prototypes, // name mangling is not done and the unique suffix should not be appeneded // then. if (ShouldMangle && isUniqueInternalLinkageDecl(GD, CGM)) { assert(CGM.getCodeGenOpts().UniqueInternalLinkageNames && "Hash computed when not explicitly requested"); Out << CGM.getModuleNameHash(); } if (const auto *FD = dyn_cast(ND)) if (FD->isMultiVersion() && !OmitMultiVersionMangling) { switch (FD->getMultiVersionKind()) { case MultiVersionKind::CPUDispatch: case MultiVersionKind::CPUSpecific: AppendCPUSpecificCPUDispatchMangling(CGM, FD->getAttr(), GD.getMultiVersionIndex(), Out); break; case MultiVersionKind::Target: AppendTargetMangling(CGM, FD->getAttr(), Out); break; case MultiVersionKind::TargetVersion: AppendTargetVersionMangling(CGM, FD->getAttr(), Out); break; case MultiVersionKind::TargetClones: AppendTargetClonesMangling(CGM, FD->getAttr(), GD.getMultiVersionIndex(), Out); break; case MultiVersionKind::None: llvm_unreachable("None multiversion type isn't valid here"); } } // Make unique name for device side static file-scope variable for HIP. if (CGM.getContext().shouldExternalize(ND) && CGM.getLangOpts().GPURelocatableDeviceCode && CGM.getLangOpts().CUDAIsDevice) CGM.printPostfixForExternalizedDecl(Out, ND); return std::string(Out.str()); } void CodeGenModule::UpdateMultiVersionNames(GlobalDecl GD, const FunctionDecl *FD, StringRef &CurName) { if (!FD->isMultiVersion()) return; // Get the name of what this would be without the 'target' attribute. This // allows us to lookup the version that was emitted when this wasn't a // multiversion function. std::string NonTargetName = getMangledNameImpl(*this, GD, FD, /*OmitMultiVersionMangling=*/true); GlobalDecl OtherGD; if (lookupRepresentativeDecl(NonTargetName, OtherGD)) { assert(OtherGD.getCanonicalDecl() .getDecl() ->getAsFunction() ->isMultiVersion() && "Other GD should now be a multiversioned function"); // OtherFD is the version of this function that was mangled BEFORE // becoming a MultiVersion function. It potentially needs to be updated. 
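    // (Illustrative: if the first declaration seen carried
    // __attribute__((target("sse4.2"))) and was emitted as plain @foo, the
    // code below renames that IR value to @foo.sse4.2 once foo is known to be
    // multiversioned.)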
const FunctionDecl *OtherFD = OtherGD.getCanonicalDecl() .getDecl() ->getAsFunction() ->getMostRecentDecl(); std::string OtherName = getMangledNameImpl(*this, OtherGD, OtherFD); // This is so that if the initial version was already the 'default' // version, we don't try to update it. if (OtherName != NonTargetName) { // Remove instead of erase, since others may have stored the StringRef // to this. const auto ExistingRecord = Manglings.find(NonTargetName); if (ExistingRecord != std::end(Manglings)) Manglings.remove(&(*ExistingRecord)); auto Result = Manglings.insert(std::make_pair(OtherName, OtherGD)); StringRef OtherNameRef = MangledDeclNames[OtherGD.getCanonicalDecl()] = Result.first->first(); // If this is the current decl is being created, make sure we update the name. if (GD.getCanonicalDecl() == OtherGD.getCanonicalDecl()) CurName = OtherNameRef; if (llvm::GlobalValue *Entry = GetGlobalValue(NonTargetName)) Entry->setName(OtherName); } } } StringRef CodeGenModule::getMangledName(GlobalDecl GD) { GlobalDecl CanonicalGD = GD.getCanonicalDecl(); // Some ABIs don't have constructor variants. Make sure that base and // complete constructors get mangled the same. if (const auto *CD = dyn_cast(CanonicalGD.getDecl())) { if (!getTarget().getCXXABI().hasConstructorVariants()) { CXXCtorType OrigCtorType = GD.getCtorType(); assert(OrigCtorType == Ctor_Base || OrigCtorType == Ctor_Complete); if (OrigCtorType == Ctor_Base) CanonicalGD = GlobalDecl(CD, Ctor_Complete); } } // In CUDA/HIP device compilation with -fgpu-rdc, the mangled name of a // static device variable depends on whether the variable is referenced by // a host or device host function. Therefore the mangled name cannot be // cached. if (!LangOpts.CUDAIsDevice || !getContext().mayExternalize(GD.getDecl())) { auto FoundName = MangledDeclNames.find(CanonicalGD); if (FoundName != MangledDeclNames.end()) return FoundName->second; } // Keep the first result in the case of a mangling collision. const auto *ND = cast(GD.getDecl()); std::string MangledName = getMangledNameImpl(*this, GD, ND); // Ensure either we have different ABIs between host and device compilations, // says host compilation following MSVC ABI but device compilation follows // Itanium C++ ABI or, if they follow the same ABI, kernel names after // mangling should be the same after name stubbing. The later checking is // very important as the device kernel name being mangled in host-compilation // is used to resolve the device binaries to be executed. Inconsistent naming // result in undefined behavior. Even though we cannot check that naming // directly between host- and device-compilations, the host- and // device-mangling in host compilation could help catching certain ones. 
assert(!isa(ND) || !ND->hasAttr() || getContext().shouldExternalize(ND) || getLangOpts().CUDAIsDevice || (getContext().getAuxTargetInfo() && (getContext().getAuxTargetInfo()->getCXXABI() != getContext().getTargetInfo().getCXXABI())) || getCUDARuntime().getDeviceSideName(ND) == getMangledNameImpl( *this, GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel), ND)); auto Result = Manglings.insert(std::make_pair(MangledName, GD)); return MangledDeclNames[CanonicalGD] = Result.first->first(); } StringRef CodeGenModule::getBlockMangledName(GlobalDecl GD, const BlockDecl *BD) { MangleContext &MangleCtx = getCXXABI().getMangleContext(); const Decl *D = GD.getDecl(); SmallString<256> Buffer; llvm::raw_svector_ostream Out(Buffer); if (!D) MangleCtx.mangleGlobalBlock(BD, dyn_cast_or_null(initializedGlobalDecl.getDecl()), Out); else if (const auto *CD = dyn_cast(D)) MangleCtx.mangleCtorBlock(CD, GD.getCtorType(), BD, Out); else if (const auto *DD = dyn_cast(D)) MangleCtx.mangleDtorBlock(DD, GD.getDtorType(), BD, Out); else MangleCtx.mangleBlock(cast(D), BD, Out); auto Result = Manglings.insert(std::make_pair(Out.str(), BD)); return Result.first->first(); } const GlobalDecl CodeGenModule::getMangledNameDecl(StringRef Name) { auto it = MangledDeclNames.begin(); while (it != MangledDeclNames.end()) { if (it->second == Name) return it->first; it++; } return GlobalDecl(); } llvm::GlobalValue *CodeGenModule::GetGlobalValue(StringRef Name) { return getModule().getNamedValue(Name); } /// AddGlobalCtor - Add a function to the list that will be called before /// main() runs. void CodeGenModule::AddGlobalCtor(llvm::Function *Ctor, int Priority, unsigned LexOrder, llvm::Constant *AssociatedData) { // FIXME: Type coercion of void()* types. GlobalCtors.push_back(Structor(Priority, LexOrder, Ctor, AssociatedData)); } /// AddGlobalDtor - Add a function to the list that will be called /// when the module is unloaded. void CodeGenModule::AddGlobalDtor(llvm::Function *Dtor, int Priority, bool IsDtorAttrFunc) { if (CodeGenOpts.RegisterGlobalDtorsWithAtExit && (!getContext().getTargetInfo().getTriple().isOSAIX() || IsDtorAttrFunc)) { DtorsUsingAtExit[Priority].push_back(Dtor); return; } // FIXME: Type coercion of void()* types. GlobalDtors.push_back(Structor(Priority, ~0U, Dtor, nullptr)); } void CodeGenModule::EmitCtorList(CtorList &Fns, const char *GlobalName) { if (Fns.empty()) return; // Ctor function type is void()*. llvm::FunctionType* CtorFTy = llvm::FunctionType::get(VoidTy, false); llvm::Type *CtorPFTy = llvm::PointerType::get(CtorFTy, TheModule.getDataLayout().getProgramAddressSpace()); // Get the type of a ctor entry, { i32, void ()*, i8* }. llvm::StructType *CtorStructTy = llvm::StructType::get( Int32Ty, CtorPFTy, VoidPtrTy); // Construct the constructor and destructor arrays. ConstantInitBuilder builder(*this); auto ctors = builder.beginArray(CtorStructTy); for (const auto &I : Fns) { auto ctor = ctors.beginStruct(CtorStructTy); ctor.addInt(Int32Ty, I.Priority); ctor.add(llvm::ConstantExpr::getBitCast(I.Initializer, CtorPFTy)); if (I.AssociatedData) ctor.add(llvm::ConstantExpr::getBitCast(I.AssociatedData, VoidPtrTy)); else ctor.addNullPointer(VoidPtrTy); ctor.finishAndAddTo(ctors); } auto list = ctors.finishAndCreateGlobal(GlobalName, getPointerAlign(), /*constant*/ false, llvm::GlobalValue::AppendingLinkage); // The LTO linker doesn't seem to like it when we set an alignment // on appending variables. Take it off as a workaround. 
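  // Illustrative result (names are made up): for
  //   __attribute__((constructor(101))) static void init(void) {}
  // the array built above comes out roughly as
  //   @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }]
  //       [{ i32, ptr, ptr } { i32 101, ptr @init, ptr null }]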
list->setAlignment(std::nullopt); Fns.clear(); } llvm::GlobalValue::LinkageTypes CodeGenModule::getFunctionLinkage(GlobalDecl GD) { const auto *D = cast(GD.getDecl()); GVALinkage Linkage = getContext().GetGVALinkageForFunction(D); if (const auto *Dtor = dyn_cast(D)) return getCXXABI().getCXXDestructorLinkage(Linkage, Dtor, GD.getDtorType()); if (isa(D) && cast(D)->isInheritingConstructor() && Context.getTargetInfo().getCXXABI().isMicrosoft()) { // Our approach to inheriting constructors is fundamentally different from // that used by the MS ABI, so keep our inheriting constructor thunks // internal rather than trying to pick an unambiguous mangling for them. return llvm::GlobalValue::InternalLinkage; } return getLLVMLinkageForDeclarator(D, Linkage); } llvm::ConstantInt *CodeGenModule::CreateCrossDsoCfiTypeId(llvm::Metadata *MD) { llvm::MDString *MDS = dyn_cast(MD); if (!MDS) return nullptr; return llvm::ConstantInt::get(Int64Ty, llvm::MD5Hash(MDS->getString())); } llvm::ConstantInt *CodeGenModule::CreateKCFITypeId(QualType T) { if (auto *FnType = T->getAs()) T = getContext().getFunctionType( FnType->getReturnType(), FnType->getParamTypes(), FnType->getExtProtoInfo().withExceptionSpec(EST_None)); std::string OutName; llvm::raw_string_ostream Out(OutName); getCXXABI().getMangleContext().mangleTypeName( T, Out, getCodeGenOpts().SanitizeCfiICallNormalizeIntegers); if (getCodeGenOpts().SanitizeCfiICallNormalizeIntegers) Out << ".normalized"; return llvm::ConstantInt::get(Int32Ty, static_cast(llvm::xxHash64(OutName))); } void CodeGenModule::SetLLVMFunctionAttributes(GlobalDecl GD, const CGFunctionInfo &Info, llvm::Function *F, bool IsThunk) { unsigned CallingConv; llvm::AttributeList PAL; ConstructAttributeList(F->getName(), Info, GD, PAL, CallingConv, /*AttrOnCallSite=*/false, IsThunk); F->setAttributes(PAL); F->setCallingConv(static_cast(CallingConv)); } static void removeImageAccessQualifier(std::string& TyName) { std::string ReadOnlyQual("__read_only"); std::string::size_type ReadOnlyPos = TyName.find(ReadOnlyQual); if (ReadOnlyPos != std::string::npos) // "+ 1" for the space after access qualifier. TyName.erase(ReadOnlyPos, ReadOnlyQual.size() + 1); else { std::string WriteOnlyQual("__write_only"); std::string::size_type WriteOnlyPos = TyName.find(WriteOnlyQual); if (WriteOnlyPos != std::string::npos) TyName.erase(WriteOnlyPos, WriteOnlyQual.size() + 1); else { std::string ReadWriteQual("__read_write"); std::string::size_type ReadWritePos = TyName.find(ReadWriteQual); if (ReadWritePos != std::string::npos) TyName.erase(ReadWritePos, ReadWriteQual.size() + 1); } } } // Returns the address space id that should be produced to the // kernel_arg_addr_space metadata. This is always fixed to the ids // as specified in the SPIR 2.0 specification in order to differentiate // for example in clGetKernelArgInfo() implementation between the address // spaces with targets without unique mapping to the OpenCL address spaces // (basically all single AS CPUs). static unsigned ArgInfoAddressSpace(LangAS AS) { switch (AS) { case LangAS::opencl_global: return 1; case LangAS::opencl_constant: return 2; case LangAS::opencl_local: return 3; case LangAS::opencl_generic: return 4; // Not in SPIR 2.0 specs. case LangAS::opencl_global_device: return 5; case LangAS::opencl_global_host: return 6; default: return 0; // Assume private. 
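  // (Illustrative: even on a target where every OpenCL address space lowers to
  // LLVM AS 0, e.g. a typical single-AS CPU, a __global int * kernel argument
  // is still reported as address space 1 in the kernel_arg_addr_space
  // metadata, per the SPIR numbering above.)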
} } void CodeGenModule::GenKernelArgMetadata(llvm::Function *Fn, const FunctionDecl *FD, CodeGenFunction *CGF) { assert(((FD && CGF) || (!FD && !CGF)) && "Incorrect use - FD and CGF should either be both null or not!"); // Create MDNodes that represent the kernel arg metadata. // Each MDNode is a list in the form of "key", N number of values which is // the same number of values as their are kernel arguments. const PrintingPolicy &Policy = Context.getPrintingPolicy(); // MDNode for the kernel argument address space qualifiers. SmallVector addressQuals; // MDNode for the kernel argument access qualifiers (images only). SmallVector accessQuals; // MDNode for the kernel argument type names. SmallVector argTypeNames; // MDNode for the kernel argument base type names. SmallVector argBaseTypeNames; // MDNode for the kernel argument type qualifiers. SmallVector argTypeQuals; // MDNode for the kernel argument names. SmallVector argNames; if (FD && CGF) for (unsigned i = 0, e = FD->getNumParams(); i != e; ++i) { const ParmVarDecl *parm = FD->getParamDecl(i); // Get argument name. argNames.push_back(llvm::MDString::get(VMContext, parm->getName())); if (!getLangOpts().OpenCL) continue; QualType ty = parm->getType(); std::string typeQuals; // Get image and pipe access qualifier: if (ty->isImageType() || ty->isPipeType()) { const Decl *PDecl = parm; if (const auto *TD = ty->getAs()) PDecl = TD->getDecl(); const OpenCLAccessAttr *A = PDecl->getAttr(); if (A && A->isWriteOnly()) accessQuals.push_back(llvm::MDString::get(VMContext, "write_only")); else if (A && A->isReadWrite()) accessQuals.push_back(llvm::MDString::get(VMContext, "read_write")); else accessQuals.push_back(llvm::MDString::get(VMContext, "read_only")); } else accessQuals.push_back(llvm::MDString::get(VMContext, "none")); auto getTypeSpelling = [&](QualType Ty) { auto typeName = Ty.getUnqualifiedType().getAsString(Policy); if (Ty.isCanonical()) { StringRef typeNameRef = typeName; // Turn "unsigned type" to "utype" if (typeNameRef.consume_front("unsigned ")) return std::string("u") + typeNameRef.str(); if (typeNameRef.consume_front("signed ")) return typeNameRef.str(); } return typeName; }; if (ty->isPointerType()) { QualType pointeeTy = ty->getPointeeType(); // Get address qualifier. addressQuals.push_back( llvm::ConstantAsMetadata::get(CGF->Builder.getInt32( ArgInfoAddressSpace(pointeeTy.getAddressSpace())))); // Get argument type name. std::string typeName = getTypeSpelling(pointeeTy) + "*"; std::string baseTypeName = getTypeSpelling(pointeeTy.getCanonicalType()) + "*"; argTypeNames.push_back(llvm::MDString::get(VMContext, typeName)); argBaseTypeNames.push_back( llvm::MDString::get(VMContext, baseTypeName)); // Get argument type qualifiers: if (ty.isRestrictQualified()) typeQuals = "restrict"; if (pointeeTy.isConstQualified() || (pointeeTy.getAddressSpace() == LangAS::opencl_constant)) typeQuals += typeQuals.empty() ? "const" : " const"; if (pointeeTy.isVolatileQualified()) typeQuals += typeQuals.empty() ? "volatile" : " volatile"; } else { uint32_t AddrSpc = 0; bool isPipe = ty->isPipeType(); if (ty->isImageType() || isPipe) AddrSpc = ArgInfoAddressSpace(LangAS::opencl_global); addressQuals.push_back( llvm::ConstantAsMetadata::get(CGF->Builder.getInt32(AddrSpc))); // Get argument type name. ty = isPipe ? 
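      // (Illustrative: for a "pipe int" parameter the element type "int" is
      // what gets reported, and the "pipe" qualifier is added to
      // kernel_arg_type_qual further below.)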
ty->castAs()->getElementType() : ty; std::string typeName = getTypeSpelling(ty); std::string baseTypeName = getTypeSpelling(ty.getCanonicalType()); // Remove access qualifiers on images // (as they are inseparable from type in clang implementation, // but OpenCL spec provides a special query to get access qualifier // via clGetKernelArgInfo with CL_KERNEL_ARG_ACCESS_QUALIFIER): if (ty->isImageType()) { removeImageAccessQualifier(typeName); removeImageAccessQualifier(baseTypeName); } argTypeNames.push_back(llvm::MDString::get(VMContext, typeName)); argBaseTypeNames.push_back( llvm::MDString::get(VMContext, baseTypeName)); if (isPipe) typeQuals = "pipe"; } argTypeQuals.push_back(llvm::MDString::get(VMContext, typeQuals)); } if (getLangOpts().OpenCL) { Fn->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(VMContext, addressQuals)); Fn->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(VMContext, accessQuals)); Fn->setMetadata("kernel_arg_type", llvm::MDNode::get(VMContext, argTypeNames)); Fn->setMetadata("kernel_arg_base_type", llvm::MDNode::get(VMContext, argBaseTypeNames)); Fn->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(VMContext, argTypeQuals)); } if (getCodeGenOpts().EmitOpenCLArgMetadata || getCodeGenOpts().HIPSaveKernelArgName) Fn->setMetadata("kernel_arg_name", llvm::MDNode::get(VMContext, argNames)); } /// Determines whether the language options require us to model /// unwind exceptions. We treat -fexceptions as mandating this /// except under the fragile ObjC ABI with only ObjC exceptions /// enabled. This means, for example, that C with -fexceptions /// enables this. static bool hasUnwindExceptions(const LangOptions &LangOpts) { // If exceptions are completely disabled, obviously this is false. if (!LangOpts.Exceptions) return false; // If C++ exceptions are enabled, this is true. if (LangOpts.CXXExceptions) return true; // If ObjC exceptions are enabled, this depends on the ABI. if (LangOpts.ObjCExceptions) { return LangOpts.ObjCRuntime.hasUnwindExceptions(); } return true; } static bool requiresMemberFunctionPointerTypeMetadata(CodeGenModule &CGM, const CXXMethodDecl *MD) { // Check that the type metadata can ever actually be used by a call. if (!CGM.getCodeGenOpts().LTOUnit || !CGM.HasHiddenLTOVisibility(MD->getParent())) return false; // Only functions whose address can be taken with a member function pointer // need this sort of type metadata. return !MD->isStatic() && !MD->isVirtual() && !isa(MD) && !isa(MD); } std::vector CodeGenModule::getMostBaseClasses(const CXXRecordDecl *RD) { llvm::SetVector MostBases; std::function CollectMostBases; CollectMostBases = [&](const CXXRecordDecl *RD) { if (RD->getNumBases() == 0) MostBases.insert(RD); for (const CXXBaseSpecifier &B : RD->bases()) CollectMostBases(B.getType()->getAsCXXRecordDecl()); }; CollectMostBases(RD); return MostBases.takeVector(); } void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D, llvm::Function *F) { llvm::AttrBuilder B(F->getContext()); if ((!D || !D->hasAttr()) && CodeGenOpts.UnwindTables) B.addUWTableAttr(llvm::UWTableKind(CodeGenOpts.UnwindTables)); if (CodeGenOpts.StackClashProtector) B.addAttribute("probe-stack", "inline-asm"); if (!hasUnwindExceptions(LangOpts)) B.addAttribute(llvm::Attribute::NoUnwind); if (D && D->hasAttr()) ; // Do nothing. 
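  // (Illustrative: a function marked __attribute__((no_stack_protector)) hits
  // the empty branch above and receives none of the ssp/sspstrong/sspreq
  // attributes chosen below.)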
else if (D && D->hasAttr() && LangOpts.getStackProtector() == LangOptions::SSPOn) B.addAttribute(llvm::Attribute::StackProtectStrong); else if (LangOpts.getStackProtector() == LangOptions::SSPOn) B.addAttribute(llvm::Attribute::StackProtect); else if (LangOpts.getStackProtector() == LangOptions::SSPStrong) B.addAttribute(llvm::Attribute::StackProtectStrong); else if (LangOpts.getStackProtector() == LangOptions::SSPReq) B.addAttribute(llvm::Attribute::StackProtectReq); if (!D) { // If we don't have a declaration to control inlining, the function isn't // explicitly marked as alwaysinline for semantic reasons, and inlining is // disabled, mark the function as noinline. if (!F->hasFnAttribute(llvm::Attribute::AlwaysInline) && CodeGenOpts.getInlining() == CodeGenOptions::OnlyAlwaysInlining) B.addAttribute(llvm::Attribute::NoInline); F->addFnAttrs(B); return; } // Track whether we need to add the optnone LLVM attribute, // starting with the default for this optimization level. bool ShouldAddOptNone = !CodeGenOpts.DisableO0ImplyOptNone && CodeGenOpts.OptimizationLevel == 0; // We can't add optnone in the following cases, it won't pass the verifier. ShouldAddOptNone &= !D->hasAttr(); ShouldAddOptNone &= !D->hasAttr(); // Add optnone, but do so only if the function isn't always_inline. if ((ShouldAddOptNone || D->hasAttr()) && !F->hasFnAttribute(llvm::Attribute::AlwaysInline)) { B.addAttribute(llvm::Attribute::OptimizeNone); // OptimizeNone implies noinline; we should not be inlining such functions. B.addAttribute(llvm::Attribute::NoInline); // We still need to handle naked functions even though optnone subsumes // much of their semantics. if (D->hasAttr()) B.addAttribute(llvm::Attribute::Naked); // OptimizeNone wins over OptimizeForSize and MinSize. F->removeFnAttr(llvm::Attribute::OptimizeForSize); F->removeFnAttr(llvm::Attribute::MinSize); } else if (D->hasAttr()) { // Naked implies noinline: we should not be inlining such functions. B.addAttribute(llvm::Attribute::Naked); B.addAttribute(llvm::Attribute::NoInline); } else if (D->hasAttr()) { B.addAttribute(llvm::Attribute::NoDuplicate); } else if (D->hasAttr() && !F->hasFnAttribute(llvm::Attribute::AlwaysInline)) { // Add noinline if the function isn't always_inline. B.addAttribute(llvm::Attribute::NoInline); } else if (D->hasAttr() && !F->hasFnAttribute(llvm::Attribute::NoInline)) { // (noinline wins over always_inline, and we can't specify both in IR) B.addAttribute(llvm::Attribute::AlwaysInline); } else if (CodeGenOpts.getInlining() == CodeGenOptions::OnlyAlwaysInlining) { // If we're not inlining, then force everything that isn't always_inline to // carry an explicit noinline attribute. if (!F->hasFnAttribute(llvm::Attribute::AlwaysInline)) B.addAttribute(llvm::Attribute::NoInline); } else { // Otherwise, propagate the inline hint attribute and potentially use its // absence to mark things as noinline. if (auto *FD = dyn_cast(D)) { // Search function and template pattern redeclarations for inline. 
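      // (Illustrative: 'inline void f();' on any redeclaration, or on the
      // template pattern an instantiation came from, is enough for the
      // InlineHint IR attribute to be added below.)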
auto CheckForInline = [](const FunctionDecl *FD) { auto CheckRedeclForInline = [](const FunctionDecl *Redecl) { return Redecl->isInlineSpecified(); }; if (any_of(FD->redecls(), CheckRedeclForInline)) return true; const FunctionDecl *Pattern = FD->getTemplateInstantiationPattern(); if (!Pattern) return false; return any_of(Pattern->redecls(), CheckRedeclForInline); }; if (CheckForInline(FD)) { B.addAttribute(llvm::Attribute::InlineHint); } else if (CodeGenOpts.getInlining() == CodeGenOptions::OnlyHintInlining && !FD->isInlined() && !F->hasFnAttribute(llvm::Attribute::AlwaysInline)) { B.addAttribute(llvm::Attribute::NoInline); } } } // Add other optimization related attributes if we are optimizing this // function. if (!D->hasAttr()) { if (D->hasAttr()) { if (!ShouldAddOptNone) B.addAttribute(llvm::Attribute::OptimizeForSize); B.addAttribute(llvm::Attribute::Cold); } if (D->hasAttr()) B.addAttribute(llvm::Attribute::Hot); if (D->hasAttr()) B.addAttribute(llvm::Attribute::MinSize); } F->addFnAttrs(B); unsigned alignment = D->getMaxAlignment() / Context.getCharWidth(); if (alignment) F->setAlignment(llvm::Align(alignment)); if (!D->hasAttr()) if (LangOpts.FunctionAlignment) F->setAlignment(llvm::Align(1ull << LangOpts.FunctionAlignment)); // Some C++ ABIs require 2-byte alignment for member functions, in order to // reserve a bit for differentiating between virtual and non-virtual member // functions. If the current target's C++ ABI requires this and this is a // member function, set its alignment accordingly. if (getTarget().getCXXABI().areMemberFunctionsAligned()) { - if (F->getPointerAlignment(getDataLayout()) < 2 && isa(D)) + if (isa(D) && F->getPointerAlignment(getDataLayout()) < 2) F->setAlignment(std::max(llvm::Align(2), F->getAlign().valueOrOne())); } // In the cross-dso CFI mode with canonical jump tables, we want !type // attributes on definitions only. if (CodeGenOpts.SanitizeCfiCrossDso && CodeGenOpts.SanitizeCfiCanonicalJumpTables) { if (auto *FD = dyn_cast(D)) { // Skip available_externally functions. They won't be codegen'ed in the // current module anyway. if (getContext().GetGVALinkageForFunction(FD) != GVA_AvailableExternally) CreateFunctionTypeMetadataForIcall(FD, F); } } // Emit type metadata on member functions for member function pointer checks. // These are only ever necessary on definitions; we're guaranteed that the // definition will be present in the LTO unit as a result of LTO visibility. 
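  // (Illustrative: for 'struct S { void f(); };' the definition of S::f gets
  // !type metadata keyed on the member-pointer type 'void (S::*)()' of each
  // most-base class, which is what -fsanitize=cfi-mfcall checks at indirect
  // member calls.)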
auto *MD = dyn_cast(D); if (MD && requiresMemberFunctionPointerTypeMetadata(*this, MD)) { for (const CXXRecordDecl *Base : getMostBaseClasses(MD->getParent())) { llvm::Metadata *Id = CreateMetadataIdentifierForType(Context.getMemberPointerType( MD->getType(), Context.getRecordType(Base).getTypePtr())); F->addTypeMetadata(0, Id); } } } void CodeGenModule::SetCommonAttributes(GlobalDecl GD, llvm::GlobalValue *GV) { const Decl *D = GD.getDecl(); if (isa_and_nonnull(D)) setGVProperties(GV, GD); else GV->setVisibility(llvm::GlobalValue::DefaultVisibility); if (D && D->hasAttr()) addUsedOrCompilerUsedGlobal(GV); if (const auto *VD = dyn_cast_if_present(D); VD && ((CodeGenOpts.KeepPersistentStorageVariables && (VD->getStorageDuration() == SD_Static || VD->getStorageDuration() == SD_Thread)) || (CodeGenOpts.KeepStaticConsts && VD->getStorageDuration() == SD_Static && VD->getType().isConstQualified()))) addUsedOrCompilerUsedGlobal(GV); } bool CodeGenModule::GetCPUAndFeaturesAttributes(GlobalDecl GD, llvm::AttrBuilder &Attrs, bool SetTargetFeatures) { // Add target-cpu and target-features attributes to functions. If // we have a decl for the function and it has a target attribute then // parse that and add it to the feature set. StringRef TargetCPU = getTarget().getTargetOpts().CPU; StringRef TuneCPU = getTarget().getTargetOpts().TuneCPU; std::vector Features; const auto *FD = dyn_cast_or_null(GD.getDecl()); FD = FD ? FD->getMostRecentDecl() : FD; const auto *TD = FD ? FD->getAttr() : nullptr; const auto *TV = FD ? FD->getAttr() : nullptr; assert((!TD || !TV) && "both target_version and target specified"); const auto *SD = FD ? FD->getAttr() : nullptr; const auto *TC = FD ? FD->getAttr() : nullptr; bool AddedAttr = false; if (TD || TV || SD || TC) { llvm::StringMap FeatureMap; getContext().getFunctionFeatureMap(FeatureMap, GD); // Produce the canonical string for this set of features. for (const llvm::StringMap::value_type &Entry : FeatureMap) Features.push_back((Entry.getValue() ? "+" : "-") + Entry.getKey().str()); // Now add the target-cpu and target-features to the function. // While we populated the feature map above, we still need to // get and parse the target attribute so we can get the cpu for // the function. if (TD) { ParsedTargetAttr ParsedAttr = Target.parseTargetAttr(TD->getFeaturesStr()); if (!ParsedAttr.CPU.empty() && getTarget().isValidCPUName(ParsedAttr.CPU)) { TargetCPU = ParsedAttr.CPU; TuneCPU = ""; // Clear the tune CPU. } if (!ParsedAttr.Tune.empty() && getTarget().isValidCPUName(ParsedAttr.Tune)) TuneCPU = ParsedAttr.Tune; } if (SD) { // Apply the given CPU name as the 'tune-cpu' so that the optimizer can // favor this processor. TuneCPU = SD->getCPUName(GD.getMultiVersionIndex())->getName(); } } else { // Otherwise just add the existing target cpu and target features to the // function. 
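    // (Illustrative: a plain function in a -march=haswell compile takes this
    // branch and simply inherits the command-line feature list, whereas a
    // function with __attribute__((target("avx512f"))) took the branch above
    // and had "+avx512f" merged into its own "target-features" string.)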
Features = getTarget().getTargetOpts().Features; } if (!TargetCPU.empty()) { Attrs.addAttribute("target-cpu", TargetCPU); AddedAttr = true; } if (!TuneCPU.empty()) { Attrs.addAttribute("tune-cpu", TuneCPU); AddedAttr = true; } if (!Features.empty() && SetTargetFeatures) { llvm::erase_if(Features, [&](const std::string& F) { return getTarget().isReadOnlyFeature(F.substr(1)); }); llvm::sort(Features); Attrs.addAttribute("target-features", llvm::join(Features, ",")); AddedAttr = true; } return AddedAttr; } void CodeGenModule::setNonAliasAttributes(GlobalDecl GD, llvm::GlobalObject *GO) { const Decl *D = GD.getDecl(); SetCommonAttributes(GD, GO); if (D) { if (auto *GV = dyn_cast(GO)) { if (D->hasAttr()) addUsedGlobal(GV); if (auto *SA = D->getAttr()) GV->addAttribute("bss-section", SA->getName()); if (auto *SA = D->getAttr()) GV->addAttribute("data-section", SA->getName()); if (auto *SA = D->getAttr()) GV->addAttribute("rodata-section", SA->getName()); if (auto *SA = D->getAttr()) GV->addAttribute("relro-section", SA->getName()); } if (auto *F = dyn_cast(GO)) { if (D->hasAttr()) addUsedGlobal(F); if (auto *SA = D->getAttr()) if (!D->getAttr()) F->addFnAttr("implicit-section-name", SA->getName()); llvm::AttrBuilder Attrs(F->getContext()); if (GetCPUAndFeaturesAttributes(GD, Attrs)) { // We know that GetCPUAndFeaturesAttributes will always have the // newest set, since it has the newest possible FunctionDecl, so the // new ones should replace the old. llvm::AttributeMask RemoveAttrs; RemoveAttrs.addAttribute("target-cpu"); RemoveAttrs.addAttribute("target-features"); RemoveAttrs.addAttribute("tune-cpu"); F->removeFnAttrs(RemoveAttrs); F->addFnAttrs(Attrs); } } if (const auto *CSA = D->getAttr()) GO->setSection(CSA->getName()); else if (const auto *SA = D->getAttr()) GO->setSection(SA->getName()); } getTargetCodeGenInfo().setTargetAttributes(D, GO, *this); } void CodeGenModule::SetInternalFunctionAttributes(GlobalDecl GD, llvm::Function *F, const CGFunctionInfo &FI) { const Decl *D = GD.getDecl(); SetLLVMFunctionAttributes(GD, FI, F, /*IsThunk=*/false); SetLLVMFunctionAttributesForDefinition(D, F); F->setLinkage(llvm::Function::InternalLinkage); setNonAliasAttributes(GD, F); } static void setLinkageForGV(llvm::GlobalValue *GV, const NamedDecl *ND) { // Set linkage and visibility in case we never see a definition. LinkageInfo LV = ND->getLinkageAndVisibility(); // Don't set internal linkage on declarations. // "extern_weak" is overloaded in LLVM; we probably should have // separate linkage types for this. if (isExternallyVisible(LV.getLinkage()) && (ND->hasAttr() || ND->isWeakImported())) GV->setLinkage(llvm::GlobalValue::ExternalWeakLinkage); } void CodeGenModule::CreateFunctionTypeMetadataForIcall(const FunctionDecl *FD, llvm::Function *F) { // Only if we are checking indirect calls. if (!LangOpts.Sanitize.has(SanitizerKind::CFIICall)) return; // Non-static class methods are handled via vtable or member function pointer // checks elsewhere. if (isa(FD) && !cast(FD)->isStatic()) return; llvm::Metadata *MD = CreateMetadataIdentifierForType(FD->getType()); F->addTypeMetadata(0, MD); F->addTypeMetadata(0, CreateMetadataIdentifierGeneralized(FD->getType())); // Emit a hash-based bit set entry for cross-DSO calls. 
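  // (Gloss: in cross-DSO mode the type is additionally identified by a 64-bit
  // MD5-based integer id, produced by CreateCrossDsoCfiTypeId below, so the
  // check can be resolved across shared-library boundaries.)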
if (CodeGenOpts.SanitizeCfiCrossDso) if (auto CrossDsoTypeId = CreateCrossDsoCfiTypeId(MD)) F->addTypeMetadata(0, llvm::ConstantAsMetadata::get(CrossDsoTypeId)); } void CodeGenModule::setKCFIType(const FunctionDecl *FD, llvm::Function *F) { llvm::LLVMContext &Ctx = F->getContext(); llvm::MDBuilder MDB(Ctx); F->setMetadata(llvm::LLVMContext::MD_kcfi_type, llvm::MDNode::get( Ctx, MDB.createConstant(CreateKCFITypeId(FD->getType())))); } static bool allowKCFIIdentifier(StringRef Name) { // KCFI type identifier constants are only necessary for external assembly // functions, which means it's safe to skip unusual names. Subset of // MCAsmInfo::isAcceptableChar() and MCAsmInfoXCOFF::isAcceptableChar(). return llvm::all_of(Name, [](const char &C) { return llvm::isAlnum(C) || C == '_' || C == '.'; }); } void CodeGenModule::finalizeKCFITypes() { llvm::Module &M = getModule(); for (auto &F : M.functions()) { // Remove KCFI type metadata from non-address-taken local functions. bool AddressTaken = F.hasAddressTaken(); if (!AddressTaken && F.hasLocalLinkage()) F.eraseMetadata(llvm::LLVMContext::MD_kcfi_type); // Generate a constant with the expected KCFI type identifier for all // address-taken function declarations to support annotating indirectly // called assembly functions. if (!AddressTaken || !F.isDeclaration()) continue; const llvm::ConstantInt *Type; if (const llvm::MDNode *MD = F.getMetadata(llvm::LLVMContext::MD_kcfi_type)) Type = llvm::mdconst::extract(MD->getOperand(0)); else continue; StringRef Name = F.getName(); if (!allowKCFIIdentifier(Name)) continue; std::string Asm = (".weak __kcfi_typeid_" + Name + "\n.set __kcfi_typeid_" + Name + ", " + Twine(Type->getZExtValue()) + "\n") .str(); M.appendModuleInlineAsm(Asm); } } void CodeGenModule::SetFunctionAttributes(GlobalDecl GD, llvm::Function *F, bool IsIncompleteFunction, bool IsThunk) { if (llvm::Intrinsic::ID IID = F->getIntrinsicID()) { // If this is an intrinsic function, set the function's attributes // to the intrinsic's attributes. F->setAttributes(llvm::Intrinsic::getAttributes(getLLVMContext(), IID)); return; } const auto *FD = cast(GD.getDecl()); if (!IsIncompleteFunction) SetLLVMFunctionAttributes(GD, getTypes().arrangeGlobalDeclaration(GD), F, IsThunk); // Add the Returned attribute for "this", except for iOS 5 and earlier // where substantial code, including the libstdc++ dylib, was compiled with // GCC and does not actually return "this". if (!IsThunk && getCXXABI().HasThisReturn(GD) && !(getTriple().isiOS() && getTriple().isOSVersionLT(6))) { assert(!F->arg_empty() && F->arg_begin()->getType() ->canLosslesslyBitCastTo(F->getReturnType()) && "unexpected this return"); F->addParamAttr(0, llvm::Attribute::Returned); } // Only a few attributes are set on declarations; these may later be // overridden by a definition. setLinkageForGV(F, FD); setGVProperties(F, FD); // Setup target-specific attributes. if (!IsIncompleteFunction && F->isDeclaration()) getTargetCodeGenInfo().setTargetAttributes(FD, F, *this); if (const auto *CSA = FD->getAttr()) F->setSection(CSA->getName()); else if (const auto *SA = FD->getAttr()) F->setSection(SA->getName()); if (const auto *EA = FD->getAttr()) { if (EA->isError()) F->addFnAttr("dontcall-error", EA->getUserDiagnostic()); else if (EA->isWarning()) F->addFnAttr("dontcall-warn", EA->getUserDiagnostic()); } // If we plan on emitting this inline builtin, we can't treat it as a builtin. 
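  // (Gloss, illustrative: "inline builtin declarations" are builtin-named
  // functions declared extern inline with the gnu_inline attribute, e.g. the
  // glibc fortify wrappers around memcpy.)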
if (FD->isInlineBuiltinDeclaration()) { const FunctionDecl *FDBody; bool HasBody = FD->hasBody(FDBody); (void)HasBody; assert(HasBody && "Inline builtin declarations should always have an " "available body!"); if (shouldEmitFunction(FDBody)) F->addFnAttr(llvm::Attribute::NoBuiltin); } if (FD->isReplaceableGlobalAllocationFunction()) { // A replaceable global allocation function does not act like a builtin by // default, only if it is invoked by a new-expression or delete-expression. F->addFnAttr(llvm::Attribute::NoBuiltin); } if (isa(FD) || isa(FD)) F->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); else if (const auto *MD = dyn_cast(FD)) if (MD->isVirtual()) F->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); // Don't emit entries for function declarations in the cross-DSO mode. This // is handled with better precision by the receiving DSO. But if jump tables // are non-canonical then we need type metadata in order to produce the local // jump table. if (!CodeGenOpts.SanitizeCfiCrossDso || !CodeGenOpts.SanitizeCfiCanonicalJumpTables) CreateFunctionTypeMetadataForIcall(FD, F); if (LangOpts.Sanitize.has(SanitizerKind::KCFI)) setKCFIType(FD, F); if (getLangOpts().OpenMP && FD->hasAttr()) getOpenMPRuntime().emitDeclareSimdFunction(FD, F); if (CodeGenOpts.InlineMaxStackSize != UINT_MAX) F->addFnAttr("inline-max-stacksize", llvm::utostr(CodeGenOpts.InlineMaxStackSize)); if (const auto *CB = FD->getAttr()) { // Annotate the callback behavior as metadata: // - The callback callee (as argument number). // - The callback payloads (as argument numbers). llvm::LLVMContext &Ctx = F->getContext(); llvm::MDBuilder MDB(Ctx); // The payload indices are all but the first one in the encoding. The first // identifies the callback callee. int CalleeIdx = *CB->encoding_begin(); ArrayRef PayloadIndices(CB->encoding_begin() + 1, CB->encoding_end()); F->addMetadata(llvm::LLVMContext::MD_callback, *llvm::MDNode::get(Ctx, {MDB.createCallbackEncoding( CalleeIdx, PayloadIndices, /* VarArgsArePassed */ false)})); } } void CodeGenModule::addUsedGlobal(llvm::GlobalValue *GV) { assert((isa(GV) || !GV->isDeclaration()) && "Only globals with definition can force usage."); LLVMUsed.emplace_back(GV); } void CodeGenModule::addCompilerUsedGlobal(llvm::GlobalValue *GV) { assert(!GV->isDeclaration() && "Only globals with definition can force usage."); LLVMCompilerUsed.emplace_back(GV); } void CodeGenModule::addUsedOrCompilerUsedGlobal(llvm::GlobalValue *GV) { assert((isa(GV) || !GV->isDeclaration()) && "Only globals with definition can force usage."); if (getTriple().isOSBinFormatELF()) LLVMCompilerUsed.emplace_back(GV); else LLVMUsed.emplace_back(GV); } static void emitUsed(CodeGenModule &CGM, StringRef Name, std::vector &List) { // Don't create llvm.used if there is no need. if (List.empty()) return; // Convert List to what ConstantArray needs. 
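  // (Illustrative end result: an appending global in the "llvm.metadata"
  // section, roughly
  //   @llvm.used = appending global [1 x ptr] [ptr @kept_symbol]
  // which keeps the listed symbols alive through optimization and LTO.)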
SmallVector UsedArray; UsedArray.resize(List.size()); for (unsigned i = 0, e = List.size(); i != e; ++i) { UsedArray[i] = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( cast(&*List[i]), CGM.Int8PtrTy); } if (UsedArray.empty()) return; llvm::ArrayType *ATy = llvm::ArrayType::get(CGM.Int8PtrTy, UsedArray.size()); auto *GV = new llvm::GlobalVariable( CGM.getModule(), ATy, false, llvm::GlobalValue::AppendingLinkage, llvm::ConstantArray::get(ATy, UsedArray), Name); GV->setSection("llvm.metadata"); } void CodeGenModule::emitLLVMUsed() { emitUsed(*this, "llvm.used", LLVMUsed); emitUsed(*this, "llvm.compiler.used", LLVMCompilerUsed); } void CodeGenModule::AppendLinkerOptions(StringRef Opts) { auto *MDOpts = llvm::MDString::get(getLLVMContext(), Opts); LinkerOptionsMetadata.push_back(llvm::MDNode::get(getLLVMContext(), MDOpts)); } void CodeGenModule::AddDetectMismatch(StringRef Name, StringRef Value) { llvm::SmallString<32> Opt; getTargetCodeGenInfo().getDetectMismatchOption(Name, Value, Opt); if (Opt.empty()) return; auto *MDOpts = llvm::MDString::get(getLLVMContext(), Opt); LinkerOptionsMetadata.push_back(llvm::MDNode::get(getLLVMContext(), MDOpts)); } void CodeGenModule::AddDependentLib(StringRef Lib) { auto &C = getLLVMContext(); if (getTarget().getTriple().isOSBinFormatELF()) { ELFDependentLibraries.push_back( llvm::MDNode::get(C, llvm::MDString::get(C, Lib))); return; } llvm::SmallString<24> Opt; getTargetCodeGenInfo().getDependentLibraryOption(Lib, Opt); auto *MDOpts = llvm::MDString::get(getLLVMContext(), Opt); LinkerOptionsMetadata.push_back(llvm::MDNode::get(C, MDOpts)); } /// Add link options implied by the given module, including modules /// it depends on, using a postorder walk. static void addLinkOptionsPostorder(CodeGenModule &CGM, Module *Mod, SmallVectorImpl &Metadata, llvm::SmallPtrSet &Visited) { // Import this module's parent. if (Mod->Parent && Visited.insert(Mod->Parent).second) { addLinkOptionsPostorder(CGM, Mod->Parent, Metadata, Visited); } // Import this module's dependencies. for (Module *Import : llvm::reverse(Mod->Imports)) { if (Visited.insert(Import).second) addLinkOptionsPostorder(CGM, Import, Metadata, Visited); } // Add linker options to link against the libraries/frameworks // described by this module. llvm::LLVMContext &Context = CGM.getLLVMContext(); bool IsELF = CGM.getTarget().getTriple().isOSBinFormatELF(); // For modules that use export_as for linking, use that module // name instead. if (Mod->UseExportAsModuleLinkName) return; for (const Module::LinkLibrary &LL : llvm::reverse(Mod->LinkLibraries)) { // Link against a framework. Frameworks are currently Darwin only, so we // don't to ask TargetCodeGenInfo for the spelling of the linker option. if (LL.IsFramework) { llvm::Metadata *Args[2] = {llvm::MDString::get(Context, "-framework"), llvm::MDString::get(Context, LL.Library)}; Metadata.push_back(llvm::MDNode::get(Context, Args)); continue; } // Link against a library. if (IsELF) { llvm::Metadata *Args[2] = { llvm::MDString::get(Context, "lib"), llvm::MDString::get(Context, LL.Library), }; Metadata.push_back(llvm::MDNode::get(Context, Args)); } else { llvm::SmallString<24> Opt; CGM.getTargetCodeGenInfo().getDependentLibraryOption(LL.Library, Opt); auto *OptString = llvm::MDString::get(Context, Opt); Metadata.push_back(llvm::MDNode::get(Context, OptString)); } } } void CodeGenModule::EmitModuleInitializers(clang::Module *Primary) { // Emit the initializers in the order that sub-modules appear in the // source, first Global Module Fragments, if present. 
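  // (Gloss, illustrative: in 'module; #include <foo.h> export module M;' the
  // global module fragment holds the declarations coming from foo.h; only
  // variable initializers are expected there, which the assert below
  // enforces.)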
if (auto GMF = Primary->getGlobalModuleFragment()) { for (Decl *D : getContext().getModuleInitializers(GMF)) { if (isa(D)) continue; assert(isa(D) && "GMF initializer decl is not a var?"); EmitTopLevelDecl(D); } } // Second any associated with the module, itself. for (Decl *D : getContext().getModuleInitializers(Primary)) { // Skip import decls, the inits for those are called explicitly. if (isa(D)) continue; EmitTopLevelDecl(D); } // Third any associated with the Privat eMOdule Fragment, if present. if (auto PMF = Primary->getPrivateModuleFragment()) { for (Decl *D : getContext().getModuleInitializers(PMF)) { assert(isa(D) && "PMF initializer decl is not a var?"); EmitTopLevelDecl(D); } } } void CodeGenModule::EmitModuleLinkOptions() { // Collect the set of all of the modules we want to visit to emit link // options, which is essentially the imported modules and all of their // non-explicit child modules. llvm::SetVector LinkModules; llvm::SmallPtrSet Visited; SmallVector Stack; // Seed the stack with imported modules. for (Module *M : ImportedModules) { // Do not add any link flags when an implementation TU of a module imports // a header of that same module. if (M->getTopLevelModuleName() == getLangOpts().CurrentModule && !getLangOpts().isCompilingModule()) continue; if (Visited.insert(M).second) Stack.push_back(M); } // Find all of the modules to import, making a little effort to prune // non-leaf modules. while (!Stack.empty()) { clang::Module *Mod = Stack.pop_back_val(); bool AnyChildren = false; // Visit the submodules of this module. for (const auto &SM : Mod->submodules()) { // Skip explicit children; they need to be explicitly imported to be // linked against. if (SM->IsExplicit) continue; if (Visited.insert(SM).second) { Stack.push_back(SM); AnyChildren = true; } } // We didn't find any children, so add this module to the list of // modules to link against. if (!AnyChildren) { LinkModules.insert(Mod); } } // Add link options for all of the imported modules in reverse topological // order. We don't do anything to try to order import link flags with respect // to linker options inserted by things like #pragma comment(). SmallVector MetadataArgs; Visited.clear(); for (Module *M : LinkModules) if (Visited.insert(M).second) addLinkOptionsPostorder(*this, M, MetadataArgs, Visited); std::reverse(MetadataArgs.begin(), MetadataArgs.end()); LinkerOptionsMetadata.append(MetadataArgs.begin(), MetadataArgs.end()); // Add the linker options metadata flag. auto *NMD = getModule().getOrInsertNamedMetadata("llvm.linker.options"); for (auto *MD : LinkerOptionsMetadata) NMD->addOperand(MD); } void CodeGenModule::EmitDeferred() { // Emit deferred declare target declarations. if (getLangOpts().OpenMP && !getLangOpts().OpenMPSimd) getOpenMPRuntime().emitDeferredTargetDecls(); // Emit code for any potentially referenced deferred decls. Since a // previously unused static decl may become used during the generation of code // for a static function, iterate until no changes are made. if (!DeferredVTables.empty()) { EmitDeferredVTables(); // Emitting a vtable doesn't directly cause more vtables to // become deferred, although it can cause functions to be // emitted that then need those vtables. assert(DeferredVTables.empty()); } // Emit CUDA/HIP static device variables referenced by host code only. // Note we should not clear CUDADeviceVarODRUsedByHost since it is still // needed for further handling. 
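  // (Gloss, illustrative: a __device__ variable that is only ODR-used from
  // host code would never be enqueued by the device-side traversal, so it is
  // appended to DeferredDeclsToEmit below to make sure it still gets emitted.)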
if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice) llvm::append_range(DeferredDeclsToEmit, getContext().CUDADeviceVarODRUsedByHost); // Stop if we're out of both deferred vtables and deferred declarations. if (DeferredDeclsToEmit.empty()) return; // Grab the list of decls to emit. If EmitGlobalDefinition schedules more // work, it will not interfere with this. std::vector CurDeclsToEmit; CurDeclsToEmit.swap(DeferredDeclsToEmit); for (GlobalDecl &D : CurDeclsToEmit) { // We should call GetAddrOfGlobal with IsForDefinition set to true in order // to get GlobalValue with exactly the type we need, not something that // might have been created for another decl with the same mangled name but // different type. llvm::GlobalValue *GV = dyn_cast( GetAddrOfGlobal(D, ForDefinition)); // In case of different address spaces, we may still get a cast, even with // IsForDefinition equal to true. Query mangled names table to get // GlobalValue. if (!GV) GV = GetGlobalValue(getMangledName(D)); // Make sure GetGlobalValue returned non-null. assert(GV); // Check to see if we've already emitted this. This is necessary // for a couple of reasons: first, decls can end up in the // deferred-decls queue multiple times, and second, decls can end // up with definitions in unusual ways (e.g. by an extern inline // function acquiring a strong function redefinition). Just // ignore these cases. if (!GV->isDeclaration()) continue; // If this is OpenMP, check if it is legal to emit this global normally. if (LangOpts.OpenMP && OpenMPRuntime && OpenMPRuntime->emitTargetGlobal(D)) continue; // Otherwise, emit the definition and move on to the next one. EmitGlobalDefinition(D, GV); // If we found out that we need to emit more decls, do that recursively. // This has the advantage that the decls are emitted in a DFS and related // ones are close together, which is convenient for testing. if (!DeferredVTables.empty() || !DeferredDeclsToEmit.empty()) { EmitDeferred(); assert(DeferredVTables.empty() && DeferredDeclsToEmit.empty()); } } } void CodeGenModule::EmitVTablesOpportunistically() { // Try to emit external vtables as available_externally if all of their // inline virtual functions have been emitted. It runs after EmitDeferred() and therefore // is not allowed to create new references to things that need to be emitted // lazily. Note that it also relies on the fact that we eagerly emit RTTI. assert((OpportunisticVTables.empty() || shouldOpportunisticallyEmitVTables()) && "Only emit opportunistic vtables with optimizations"); for (const CXXRecordDecl *RD : OpportunisticVTables) { assert(getVTables().isVTableExternal(RD) && "This queue should only contain external vtables"); if (getCXXABI().canSpeculativelyEmitVTable(RD)) VTables.GenerateClassData(RD); } OpportunisticVTables.clear(); } void CodeGenModule::EmitGlobalAnnotations() { if (Annotations.empty()) return; // Create a new global variable for the ConstantStruct in the Module. llvm::Constant *Array = llvm::ConstantArray::get(llvm::ArrayType::get( Annotations[0]->getType(), Annotations.size()), Annotations); auto *gv = new llvm::GlobalVariable(getModule(), Array->getType(), false, llvm::GlobalValue::AppendingLinkage, Array, "llvm.global.annotations"); gv->setSection(AnnotationSection); } llvm::Constant *CodeGenModule::EmitAnnotationString(StringRef Str) { llvm::Constant *&AStr = AnnotationStrings[Str]; if (AStr) return AStr; // Not found yet, create a new global.
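// The string is interned in AnnotationStrings and, assuming AnnotationSection
// is the usual "llvm.metadata", the emitted IR looks roughly like:
//   @.str = private unnamed_addr constant [N x i8] c"...\00", section "llvm.metadata"
// (the address space and exact pointer type depend on ConstGlobalsPtrTy).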
llvm::Constant *s = llvm::ConstantDataArray::getString(getLLVMContext(), Str); auto *gv = new llvm::GlobalVariable( getModule(), s->getType(), true, llvm::GlobalValue::PrivateLinkage, s, ".str", nullptr, llvm::GlobalValue::NotThreadLocal, ConstGlobalsPtrTy->getAddressSpace()); gv->setSection(AnnotationSection); gv->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); AStr = gv; return gv; } llvm::Constant *CodeGenModule::EmitAnnotationUnit(SourceLocation Loc) { SourceManager &SM = getContext().getSourceManager(); PresumedLoc PLoc = SM.getPresumedLoc(Loc); if (PLoc.isValid()) return EmitAnnotationString(PLoc.getFilename()); return EmitAnnotationString(SM.getBufferName(Loc)); } llvm::Constant *CodeGenModule::EmitAnnotationLineNo(SourceLocation L) { SourceManager &SM = getContext().getSourceManager(); PresumedLoc PLoc = SM.getPresumedLoc(L); unsigned LineNo = PLoc.isValid() ? PLoc.getLine() : SM.getExpansionLineNumber(L); return llvm::ConstantInt::get(Int32Ty, LineNo); } llvm::Constant *CodeGenModule::EmitAnnotationArgs(const AnnotateAttr *Attr) { ArrayRef Exprs = {Attr->args_begin(), Attr->args_size()}; if (Exprs.empty()) return llvm::ConstantPointerNull::get(ConstGlobalsPtrTy); llvm::FoldingSetNodeID ID; for (Expr *E : Exprs) { ID.Add(cast(E)->getAPValueResult()); } llvm::Constant *&Lookup = AnnotationArgs[ID.ComputeHash()]; if (Lookup) return Lookup; llvm::SmallVector LLVMArgs; LLVMArgs.reserve(Exprs.size()); ConstantEmitter ConstEmiter(*this); llvm::transform(Exprs, std::back_inserter(LLVMArgs), [&](const Expr *E) { const auto *CE = cast(E); return ConstEmiter.emitAbstract(CE->getBeginLoc(), CE->getAPValueResult(), CE->getType()); }); auto *Struct = llvm::ConstantStruct::getAnon(LLVMArgs); auto *GV = new llvm::GlobalVariable(getModule(), Struct->getType(), true, llvm::GlobalValue::PrivateLinkage, Struct, ".args"); GV->setSection(AnnotationSection); GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); auto *Bitcasted = llvm::ConstantExpr::getBitCast(GV, GlobalsInt8PtrTy); Lookup = Bitcasted; return Bitcasted; } llvm::Constant *CodeGenModule::EmitAnnotateAttr(llvm::GlobalValue *GV, const AnnotateAttr *AA, SourceLocation L) { // Get the globals for file name, annotation, and the line number. llvm::Constant *AnnoGV = EmitAnnotationString(AA->getAnnotation()), *UnitGV = EmitAnnotationUnit(L), *LineNoCst = EmitAnnotationLineNo(L), *Args = EmitAnnotationArgs(AA); llvm::Constant *GVInGlobalsAS = GV; if (GV->getAddressSpace() != getDataLayout().getDefaultGlobalsAddressSpace()) { GVInGlobalsAS = llvm::ConstantExpr::getAddrSpaceCast( GV, GV->getValueType()->getPointerTo( getDataLayout().getDefaultGlobalsAddressSpace())); } // Create the ConstantStruct for the global annotation. llvm::Constant *Fields[] = { llvm::ConstantExpr::getBitCast(GVInGlobalsAS, GlobalsInt8PtrTy), llvm::ConstantExpr::getBitCast(AnnoGV, ConstGlobalsPtrTy), llvm::ConstantExpr::getBitCast(UnitGV, ConstGlobalsPtrTy), LineNoCst, Args, }; return llvm::ConstantStruct::getAnon(Fields); } void CodeGenModule::AddGlobalAnnotations(const ValueDecl *D, llvm::GlobalValue *GV) { assert(D->hasAttr() && "no annotate attribute"); // Get the struct elements for these annotations. for (const auto *I : D->specific_attrs()) Annotations.push_back(EmitAnnotateAttr(GV, I, D->getLocation())); } bool CodeGenModule::isInNoSanitizeList(SanitizerMask Kind, llvm::Function *Fn, SourceLocation Loc) const { const auto &NoSanitizeL = getContext().getNoSanitizeList(); // NoSanitize by function name. 
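// (The checks below fall through in order: function name, then the ignore
// list's "mainfile" prefix for the main source file, then the "src" prefix for
// the source location; an unknown location is assumed to be in the main file.)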
if (NoSanitizeL.containsFunction(Kind, Fn->getName())) return true; // NoSanitize by location. Check "mainfile" prefix. auto &SM = Context.getSourceManager(); const FileEntry &MainFile = *SM.getFileEntryForID(SM.getMainFileID()); if (NoSanitizeL.containsMainFile(Kind, MainFile.getName())) return true; // Check "src" prefix. if (Loc.isValid()) return NoSanitizeL.containsLocation(Kind, Loc); // If location is unknown, this may be a compiler-generated function. Assume // it's located in the main file. return NoSanitizeL.containsFile(Kind, MainFile.getName()); } bool CodeGenModule::isInNoSanitizeList(SanitizerMask Kind, llvm::GlobalVariable *GV, SourceLocation Loc, QualType Ty, StringRef Category) const { const auto &NoSanitizeL = getContext().getNoSanitizeList(); if (NoSanitizeL.containsGlobal(Kind, GV->getName(), Category)) return true; auto &SM = Context.getSourceManager(); if (NoSanitizeL.containsMainFile( Kind, SM.getFileEntryForID(SM.getMainFileID())->getName(), Category)) return true; if (NoSanitizeL.containsLocation(Kind, Loc, Category)) return true; // Check global type. if (!Ty.isNull()) { // Drill down the array types: if global variable of a fixed type is // not sanitized, we also don't instrument arrays of them. while (auto AT = dyn_cast(Ty.getTypePtr())) Ty = AT->getElementType(); Ty = Ty.getCanonicalType().getUnqualifiedType(); // Only record types (classes, structs etc.) are ignored. if (Ty->isRecordType()) { std::string TypeStr = Ty.getAsString(getContext().getPrintingPolicy()); if (NoSanitizeL.containsType(Kind, TypeStr, Category)) return true; } } return false; } bool CodeGenModule::imbueXRayAttrs(llvm::Function *Fn, SourceLocation Loc, StringRef Category) const { const auto &XRayFilter = getContext().getXRayFilter(); using ImbueAttr = XRayFunctionFilter::ImbueAttribute; auto Attr = ImbueAttr::NONE; if (Loc.isValid()) Attr = XRayFilter.shouldImbueLocation(Loc, Category); if (Attr == ImbueAttr::NONE) Attr = XRayFilter.shouldImbueFunction(Fn->getName()); switch (Attr) { case ImbueAttr::NONE: return false; case ImbueAttr::ALWAYS: Fn->addFnAttr("function-instrument", "xray-always"); break; case ImbueAttr::ALWAYS_ARG1: Fn->addFnAttr("function-instrument", "xray-always"); Fn->addFnAttr("xray-log-args", "1"); break; case ImbueAttr::NEVER: Fn->addFnAttr("function-instrument", "xray-never"); break; } return true; } ProfileList::ExclusionType CodeGenModule::isFunctionBlockedByProfileList(llvm::Function *Fn, SourceLocation Loc) const { const auto &ProfileList = getContext().getProfileList(); // If the profile list is empty, then instrument everything. if (ProfileList.isEmpty()) return ProfileList::Allow; CodeGenOptions::ProfileInstrKind Kind = getCodeGenOpts().getProfileInstr(); // First, check the function name. if (auto V = ProfileList.isFunctionExcluded(Fn->getName(), Kind)) return *V; // Next, check the source location. if (Loc.isValid()) if (auto V = ProfileList.isLocationExcluded(Loc, Kind)) return *V; // If location is unknown, this may be a compiler-generated function. Assume // it's located in the main file. 
auto &SM = Context.getSourceManager(); if (const auto *MainFile = SM.getFileEntryForID(SM.getMainFileID())) if (auto V = ProfileList.isFileExcluded(MainFile->getName(), Kind)) return *V; return ProfileList.getDefault(Kind); } ProfileList::ExclusionType CodeGenModule::isFunctionBlockedFromProfileInstr(llvm::Function *Fn, SourceLocation Loc) const { auto V = isFunctionBlockedByProfileList(Fn, Loc); if (V != ProfileList::Allow) return V; auto NumGroups = getCodeGenOpts().ProfileTotalFunctionGroups; if (NumGroups > 1) { auto Group = llvm::crc32(arrayRefFromStringRef(Fn->getName())) % NumGroups; if (Group != getCodeGenOpts().ProfileSelectedFunctionGroup) return ProfileList::Skip; } return ProfileList::Allow; } bool CodeGenModule::MustBeEmitted(const ValueDecl *Global) { // Never defer when EmitAllDecls is specified. if (LangOpts.EmitAllDecls) return true; const auto *VD = dyn_cast(Global); if (VD && ((CodeGenOpts.KeepPersistentStorageVariables && (VD->getStorageDuration() == SD_Static || VD->getStorageDuration() == SD_Thread)) || (CodeGenOpts.KeepStaticConsts && VD->getStorageDuration() == SD_Static && VD->getType().isConstQualified()))) return true; return getContext().DeclMustBeEmitted(Global); } bool CodeGenModule::MayBeEmittedEagerly(const ValueDecl *Global) { // In OpenMP 5.0 variables and functions may be marked as // device_type(host/nohost), and we should not emit them eagerly unless we are // sure that they must be emitted on the host/device. To be sure, we need to // have seen a declare target with an explicit mention of the function; we know // we have if the level of the declare target attribute is -1. Note that we // check somewhere else if we should emit this at all. if (LangOpts.OpenMP >= 50 && !LangOpts.OpenMPSimd) { std::optional ActiveAttr = OMPDeclareTargetDeclAttr::getActiveAttr(Global); if (!ActiveAttr || (*ActiveAttr)->getLevel() != (unsigned)-1) return false; } if (const auto *FD = dyn_cast(Global)) { if (FD->getTemplateSpecializationKind() == TSK_ImplicitInstantiation) // Implicit template instantiations may change linkage if they are later // explicitly instantiated, so they should not be emitted eagerly. return false; } if (const auto *VD = dyn_cast(Global)) { if (Context.getInlineVariableDefinitionKind(VD) == ASTContext::InlineVariableDefinitionKind::WeakUnknown) // A definition of an inline constexpr static data member may change // linkage later if it's redeclared outside the class. return false; if (CXX20ModuleInits && VD->getOwningModule() && !VD->getOwningModule()->isModuleMapModule()) { // For CXX20, module-owned initializers need to be deferred, since it is // not known at this point if they will be run for the current module or // as part of the initializer for an imported one. return false; } } // If OpenMP is enabled and threadprivates must be generated like TLS, delay // codegen for global variables, because they may be marked as threadprivate. if (LangOpts.OpenMP && LangOpts.OpenMPUseTLS && getContext().getTargetInfo().isTLSSupported() && isa(Global) && !isTypeConstant(Global->getType(), false, false) && !OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(Global)) return false; return true; } ConstantAddress CodeGenModule::GetAddrOfMSGuidDecl(const MSGuidDecl *GD) { StringRef Name = getMangledName(GD); // The UUID descriptor should be pointer aligned. CharUnits Alignment = CharUnits::fromQuantity(PointerAlignInBytes); // Look for an existing global.
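// (If a global with this mangled name already exists it is simply reused;
// otherwise a new linkonce_odr constant is built below, in the fallback case
// with roughly the layout { i32, i16, i16, [8 x i8] } matching the
// Part1/Part2/Part3/Part4And5 split of the GUID.)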
if (llvm::GlobalVariable *GV = getModule().getNamedGlobal(Name)) return ConstantAddress(GV, GV->getValueType(), Alignment); ConstantEmitter Emitter(*this); llvm::Constant *Init; APValue &V = GD->getAsAPValue(); if (!V.isAbsent()) { // If possible, emit the APValue version of the initializer. In particular, // this gets the type of the constant right. Init = Emitter.emitForInitializer( GD->getAsAPValue(), GD->getType().getAddressSpace(), GD->getType()); } else { // As a fallback, directly construct the constant. // FIXME: This may get padding wrong under esoteric struct layout rules. // MSVC appears to create a complete type 'struct __s_GUID' that it // presumably uses to represent these constants. MSGuidDecl::Parts Parts = GD->getParts(); llvm::Constant *Fields[4] = { llvm::ConstantInt::get(Int32Ty, Parts.Part1), llvm::ConstantInt::get(Int16Ty, Parts.Part2), llvm::ConstantInt::get(Int16Ty, Parts.Part3), llvm::ConstantDataArray::getRaw( StringRef(reinterpret_cast(Parts.Part4And5), 8), 8, Int8Ty)}; Init = llvm::ConstantStruct::getAnon(Fields); } auto *GV = new llvm::GlobalVariable( getModule(), Init->getType(), /*isConstant=*/true, llvm::GlobalValue::LinkOnceODRLinkage, Init, Name); if (supportsCOMDAT()) GV->setComdat(TheModule.getOrInsertComdat(GV->getName())); setDSOLocal(GV); if (!V.isAbsent()) { Emitter.finalize(GV); return ConstantAddress(GV, GV->getValueType(), Alignment); } llvm::Type *Ty = getTypes().ConvertTypeForMem(GD->getType()); llvm::Constant *Addr = llvm::ConstantExpr::getBitCast( GV, Ty->getPointerTo(GV->getAddressSpace())); return ConstantAddress(Addr, Ty, Alignment); } ConstantAddress CodeGenModule::GetAddrOfUnnamedGlobalConstantDecl( const UnnamedGlobalConstantDecl *GCD) { CharUnits Alignment = getContext().getTypeAlignInChars(GCD->getType()); llvm::GlobalVariable **Entry = nullptr; Entry = &UnnamedGlobalConstantDeclMap[GCD]; if (*Entry) return ConstantAddress(*Entry, (*Entry)->getValueType(), Alignment); ConstantEmitter Emitter(*this); llvm::Constant *Init; const APValue &V = GCD->getValue(); assert(!V.isAbsent()); Init = Emitter.emitForInitializer(V, GCD->getType().getAddressSpace(), GCD->getType()); auto *GV = new llvm::GlobalVariable(getModule(), Init->getType(), /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, Init, ".constant"); GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); GV->setAlignment(Alignment.getAsAlign()); Emitter.finalize(GV); *Entry = GV; return ConstantAddress(GV, GV->getValueType(), Alignment); } ConstantAddress CodeGenModule::GetAddrOfTemplateParamObject( const TemplateParamObjectDecl *TPO) { StringRef Name = getMangledName(TPO); CharUnits Alignment = getNaturalTypeAlignment(TPO->getType()); if (llvm::GlobalVariable *GV = getModule().getNamedGlobal(Name)) return ConstantAddress(GV, GV->getValueType(), Alignment); ConstantEmitter Emitter(*this); llvm::Constant *Init = Emitter.emitForInitializer( TPO->getValue(), TPO->getType().getAddressSpace(), TPO->getType()); if (!Init) { ErrorUnsupported(TPO, "template parameter object"); return ConstantAddress::invalid(); } llvm::GlobalValue::LinkageTypes Linkage = isExternallyVisible(TPO->getLinkageAndVisibility().getLinkage()) ? 
llvm::GlobalValue::LinkOnceODRLinkage : llvm::GlobalValue::InternalLinkage; auto *GV = new llvm::GlobalVariable(getModule(), Init->getType(), /*isConstant=*/true, Linkage, Init, Name); setGVProperties(GV, TPO); if (supportsCOMDAT()) GV->setComdat(TheModule.getOrInsertComdat(GV->getName())); Emitter.finalize(GV); return ConstantAddress(GV, GV->getValueType(), Alignment); } ConstantAddress CodeGenModule::GetWeakRefReference(const ValueDecl *VD) { const AliasAttr *AA = VD->getAttr(); assert(AA && "No alias?"); CharUnits Alignment = getContext().getDeclAlign(VD); llvm::Type *DeclTy = getTypes().ConvertTypeForMem(VD->getType()); // See if there is already something with the target's name in the module. llvm::GlobalValue *Entry = GetGlobalValue(AA->getAliasee()); if (Entry) { unsigned AS = getTypes().getTargetAddressSpace(VD->getType()); auto Ptr = llvm::ConstantExpr::getBitCast(Entry, DeclTy->getPointerTo(AS)); return ConstantAddress(Ptr, DeclTy, Alignment); } llvm::Constant *Aliasee; if (isa(DeclTy)) Aliasee = GetOrCreateLLVMFunction(AA->getAliasee(), DeclTy, GlobalDecl(cast(VD)), /*ForVTable=*/false); else Aliasee = GetOrCreateLLVMGlobal(AA->getAliasee(), DeclTy, LangAS::Default, nullptr); auto *F = cast(Aliasee); F->setLinkage(llvm::Function::ExternalWeakLinkage); WeakRefReferences.insert(F); return ConstantAddress(Aliasee, DeclTy, Alignment); } void CodeGenModule::EmitGlobal(GlobalDecl GD) { const auto *Global = cast(GD.getDecl()); // Weak references don't produce any output by themselves. if (Global->hasAttr()) return; // If this is an alias definition (which otherwise looks like a declaration) // emit it now. if (Global->hasAttr()) return EmitAliasDefinition(GD); // IFunc like an alias whose value is resolved at runtime by calling resolver. if (Global->hasAttr()) return emitIFuncDefinition(GD); // If this is a cpu_dispatch multiversion function, emit the resolver. if (Global->hasAttr()) return emitCPUDispatchDefinition(GD); // If this is CUDA, be selective about which declarations we emit. if (LangOpts.CUDA) { if (LangOpts.CUDAIsDevice) { if (!Global->hasAttr() && !Global->hasAttr() && !Global->hasAttr() && !Global->hasAttr() && !Global->getType()->isCUDADeviceBuiltinSurfaceType() && !Global->getType()->isCUDADeviceBuiltinTextureType()) return; } else { // We need to emit host-side 'shadows' for all global // device-side variables because the CUDA runtime needs their // size and host-side address in order to provide access to // their device-side incarnations. // So device-only functions are the only things we skip. if (isa(Global) && !Global->hasAttr() && Global->hasAttr()) return; assert((isa(Global) || isa(Global)) && "Expected Variable or Function"); } } if (LangOpts.OpenMP) { // If this is OpenMP, check if it is legal to emit this global normally. if (OpenMPRuntime && OpenMPRuntime->emitTargetGlobal(GD)) return; if (auto *DRD = dyn_cast(Global)) { if (MustBeEmitted(Global)) EmitOMPDeclareReduction(DRD); return; } if (auto *DMD = dyn_cast(Global)) { if (MustBeEmitted(Global)) EmitOMPDeclareMapper(DMD); return; } } // Ignore declarations, they will be emitted on their first use. if (const auto *FD = dyn_cast(Global)) { // Forward declarations are emitted lazily on first use. if (!FD->doesThisDeclarationHaveABody()) { if (!FD->doesDeclarationForceExternallyVisibleDefinition()) return; StringRef MangledName = getMangledName(GD); // Compute the function info and LLVM type. 
const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD); llvm::Type *Ty = getTypes().GetFunctionType(FI); GetOrCreateLLVMFunction(MangledName, Ty, GD, /*ForVTable=*/false, /*DontDefer=*/false); return; } } else { const auto *VD = cast(Global); assert(VD->isFileVarDecl() && "Cannot emit local var decl as global."); if (VD->isThisDeclarationADefinition() != VarDecl::Definition && !Context.isMSStaticDataMemberInlineDefinition(VD)) { if (LangOpts.OpenMP) { // Emit declaration of the must-be-emitted declare target variable. if (std::optional Res = OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) { // If this variable has external storage and doesn't require special // link handling we defer to its canonical definition. if (VD->hasExternalStorage() && Res != OMPDeclareTargetDeclAttr::MT_Link) return; bool UnifiedMemoryEnabled = getOpenMPRuntime().hasRequiresUnifiedSharedMemory(); if ((*Res == OMPDeclareTargetDeclAttr::MT_To || *Res == OMPDeclareTargetDeclAttr::MT_Enter) && !UnifiedMemoryEnabled) { (void)GetAddrOfGlobalVar(VD); } else { assert(((*Res == OMPDeclareTargetDeclAttr::MT_Link) || ((*Res == OMPDeclareTargetDeclAttr::MT_To || *Res == OMPDeclareTargetDeclAttr::MT_Enter) && UnifiedMemoryEnabled)) && "Link clause or to clause with unified memory expected."); (void)getOpenMPRuntime().getAddrOfDeclareTargetVar(VD); } return; } } // If this declaration may have caused an inline variable definition to // change linkage, make sure that it's emitted. if (Context.getInlineVariableDefinitionKind(VD) == ASTContext::InlineVariableDefinitionKind::Strong) GetAddrOfGlobalVar(VD); return; } } // Defer code generation to first use when possible, e.g. if this is an inline // function. If the global must always be emitted, do it eagerly if possible // to benefit from cache locality. if (MustBeEmitted(Global) && MayBeEmittedEagerly(Global)) { // Emit the definition if it can't be deferred. EmitGlobalDefinition(GD); addEmittedDeferredDecl(GD); return; } // If we're deferring emission of a C++ variable with an // initializer, remember the order in which it appeared in the file. if (getLangOpts().CPlusPlus && isa(Global) && cast(Global)->hasInit()) { DelayedCXXInitPosition[Global] = CXXGlobalInits.size(); CXXGlobalInits.push_back(nullptr); } StringRef MangledName = getMangledName(GD); if (GetGlobalValue(MangledName) != nullptr) { // The value has already been used and should therefore be emitted. addDeferredDeclToEmit(GD); } else if (MustBeEmitted(Global)) { // The value must be emitted, but cannot be emitted eagerly. assert(!MayBeEmittedEagerly(Global)); addDeferredDeclToEmit(GD); } else { // Otherwise, remember that we saw a deferred decl with this name. The // first use of the mangled name will cause it to move into // DeferredDeclsToEmit. DeferredDecls[MangledName] = GD; } } // Check if T is a class type with a destructor that's not dllimport. 
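// Constructing or destroying such a type from a dllimport'ed inline function
// would pull in a call to a destructor that is not itself imported;
// DLLImportFunctionVisitor below treats that as unsafe to inline into this TU.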
static bool HasNonDllImportDtor(QualType T) { if (const auto *RT = T->getBaseElementTypeUnsafe()->getAs()) if (CXXRecordDecl *RD = dyn_cast(RT->getDecl())) if (RD->getDestructor() && !RD->getDestructor()->hasAttr()) return true; return false; } namespace { struct FunctionIsDirectlyRecursive : public ConstStmtVisitor { const StringRef Name; const Builtin::Context &BI; FunctionIsDirectlyRecursive(StringRef N, const Builtin::Context &C) : Name(N), BI(C) {} bool VisitCallExpr(const CallExpr *E) { const FunctionDecl *FD = E->getDirectCallee(); if (!FD) return false; AsmLabelAttr *Attr = FD->getAttr(); if (Attr && Name == Attr->getLabel()) return true; unsigned BuiltinID = FD->getBuiltinID(); if (!BuiltinID || !BI.isLibFunction(BuiltinID)) return false; StringRef BuiltinName = BI.getName(BuiltinID); if (BuiltinName.startswith("__builtin_") && Name == BuiltinName.slice(strlen("__builtin_"), StringRef::npos)) { return true; } return false; } bool VisitStmt(const Stmt *S) { for (const Stmt *Child : S->children()) if (Child && this->Visit(Child)) return true; return false; } }; // Make sure we're not referencing non-imported vars or functions. struct DLLImportFunctionVisitor : public RecursiveASTVisitor { bool SafeToInline = true; bool shouldVisitImplicitCode() const { return true; } bool VisitVarDecl(VarDecl *VD) { if (VD->getTLSKind()) { // A thread-local variable cannot be imported. SafeToInline = false; return SafeToInline; } // A variable definition might imply a destructor call. if (VD->isThisDeclarationADefinition()) SafeToInline = !HasNonDllImportDtor(VD->getType()); return SafeToInline; } bool VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *E) { if (const auto *D = E->getTemporary()->getDestructor()) SafeToInline = D->hasAttr(); return SafeToInline; } bool VisitDeclRefExpr(DeclRefExpr *E) { ValueDecl *VD = E->getDecl(); if (isa(VD)) SafeToInline = VD->hasAttr(); else if (VarDecl *V = dyn_cast(VD)) SafeToInline = !V->hasGlobalStorage() || V->hasAttr(); return SafeToInline; } bool VisitCXXConstructExpr(CXXConstructExpr *E) { SafeToInline = E->getConstructor()->hasAttr(); return SafeToInline; } bool VisitCXXMemberCallExpr(CXXMemberCallExpr *E) { CXXMethodDecl *M = E->getMethodDecl(); if (!M) { // Call through a pointer to member function. This is safe to inline. SafeToInline = true; } else { SafeToInline = M->hasAttr(); } return SafeToInline; } bool VisitCXXDeleteExpr(CXXDeleteExpr *E) { SafeToInline = E->getOperatorDelete()->hasAttr(); return SafeToInline; } bool VisitCXXNewExpr(CXXNewExpr *E) { SafeToInline = E->getOperatorNew()->hasAttr(); return SafeToInline; } }; } // isTriviallyRecursive - Check if this function calls another // decl that, because of the asm attribute or the other decl being a builtin, // ends up pointing to itself. bool CodeGenModule::isTriviallyRecursive(const FunctionDecl *FD) { StringRef Name; if (getCXXABI().getMangleContext().shouldMangleDeclName(FD)) { // asm labels are a special kind of mangling we have to support. AsmLabelAttr *Attr = FD->getAttr(); if (!Attr) return false; Name = Attr->getLabel(); } else { Name = FD->getName(); } FunctionIsDirectlyRecursive Walker(Name, Context.BuiltinInfo); const Stmt *Body = FD->getBody(); return Body ? 
Walker.Visit(Body) : false; } bool CodeGenModule::shouldEmitFunction(GlobalDecl GD) { if (getFunctionLinkage(GD) != llvm::Function::AvailableExternallyLinkage) return true; const auto *F = cast(GD.getDecl()); if (CodeGenOpts.OptimizationLevel == 0 && !F->hasAttr()) return false; if (F->hasAttr() && !F->hasAttr()) { // Check whether it would be safe to inline this dllimport function. DLLImportFunctionVisitor Visitor; Visitor.TraverseFunctionDecl(const_cast(F)); if (!Visitor.SafeToInline) return false; if (const CXXDestructorDecl *Dtor = dyn_cast(F)) { // Implicit destructor invocations aren't captured in the AST, so the // check above can't see them. Check for them manually here. for (const Decl *Member : Dtor->getParent()->decls()) if (isa(Member)) if (HasNonDllImportDtor(cast(Member)->getType())) return false; for (const CXXBaseSpecifier &B : Dtor->getParent()->bases()) if (HasNonDllImportDtor(B.getType())) return false; } } // Inline builtins declaration must be emitted. They often are fortified // functions. if (F->isInlineBuiltinDeclaration()) return true; // PR9614. Avoid cases where the source code is lying to us. An available // externally function should have an equivalent function somewhere else, // but a function that calls itself through asm label/`__builtin_` trickery is // clearly not equivalent to the real implementation. // This happens in glibc's btowc and in some configure checks. return !isTriviallyRecursive(F); } bool CodeGenModule::shouldOpportunisticallyEmitVTables() { return CodeGenOpts.OptimizationLevel > 0; } void CodeGenModule::EmitMultiVersionFunctionDefinition(GlobalDecl GD, llvm::GlobalValue *GV) { const auto *FD = cast(GD.getDecl()); if (FD->isCPUSpecificMultiVersion()) { auto *Spec = FD->getAttr(); for (unsigned I = 0; I < Spec->cpus_size(); ++I) EmitGlobalFunctionDefinition(GD.getWithMultiVersionIndex(I), nullptr); } else if (FD->isTargetClonesMultiVersion()) { auto *Clone = FD->getAttr(); for (unsigned I = 0; I < Clone->featuresStrs_size(); ++I) if (Clone->isFirstOfVersion(I)) EmitGlobalFunctionDefinition(GD.getWithMultiVersionIndex(I), nullptr); // Ensure that the resolver function is also emitted. GetOrCreateMultiVersionResolver(GD); } else EmitGlobalFunctionDefinition(GD, GV); } void CodeGenModule::EmitGlobalDefinition(GlobalDecl GD, llvm::GlobalValue *GV) { const auto *D = cast(GD.getDecl()); PrettyStackTraceDecl CrashInfo(const_cast(D), D->getLocation(), Context.getSourceManager(), "Generating code for declaration"); if (const auto *FD = dyn_cast(D)) { // At -O0, don't generate IR for functions with available_externally // linkage. if (!shouldEmitFunction(GD)) return; llvm::TimeTraceScope TimeScope("CodeGen Function", [&]() { std::string Name; llvm::raw_string_ostream OS(Name); FD->getNameForDiagnostic(OS, getContext().getPrintingPolicy(), /*Qualified=*/true); return Name; }); if (const auto *Method = dyn_cast(D)) { // Make sure to emit the definition(s) before we emit the thunks. // This is necessary for the generation of certain thunks. 
if (isa(Method) || isa(Method)) ABI->emitCXXStructor(GD); else if (FD->isMultiVersion()) EmitMultiVersionFunctionDefinition(GD, GV); else EmitGlobalFunctionDefinition(GD, GV); if (Method->isVirtual()) getVTables().EmitThunks(GD); return; } if (FD->isMultiVersion()) return EmitMultiVersionFunctionDefinition(GD, GV); return EmitGlobalFunctionDefinition(GD, GV); } if (const auto *VD = dyn_cast(D)) return EmitGlobalVarDefinition(VD, !VD->hasDefinition()); llvm_unreachable("Invalid argument to EmitGlobalDefinition()"); } static void ReplaceUsesOfNonProtoTypeWithRealFunction(llvm::GlobalValue *Old, llvm::Function *NewFn); static unsigned TargetMVPriority(const TargetInfo &TI, const CodeGenFunction::MultiVersionResolverOption &RO) { unsigned Priority = 0; unsigned NumFeatures = 0; for (StringRef Feat : RO.Conditions.Features) { Priority = std::max(Priority, TI.multiVersionSortPriority(Feat)); NumFeatures++; } if (!RO.Conditions.Architecture.empty()) Priority = std::max( Priority, TI.multiVersionSortPriority(RO.Conditions.Architecture)); Priority += TI.multiVersionFeatureCost() * NumFeatures; return Priority; } // Multiversion functions should be at most 'WeakODRLinkage' so that a different // TU can forward declare the function without causing problems. Particularly // in the cases of CPUDispatch, this causes issues. This also makes sure we // work with internal linkage functions, so that the same function name can be // used with internal linkage in multiple TUs. llvm::GlobalValue::LinkageTypes getMultiversionLinkage(CodeGenModule &CGM, GlobalDecl GD) { const FunctionDecl *FD = cast(GD.getDecl()); if (FD->getFormalLinkage() == InternalLinkage) return llvm::GlobalValue::InternalLinkage; return llvm::GlobalValue::WeakODRLinkage; } void CodeGenModule::emitMultiVersionFunctions() { std::vector MVFuncsToEmit; MultiVersionFuncs.swap(MVFuncsToEmit); for (GlobalDecl GD : MVFuncsToEmit) { const auto *FD = cast(GD.getDecl()); assert(FD && "Expected a FunctionDecl"); SmallVector Options; if (FD->isTargetMultiVersion()) { getContext().forEachMultiversionedFunctionVersion( FD, [this, &GD, &Options](const FunctionDecl *CurFD) { GlobalDecl CurGD{ (CurFD->isDefined() ? CurFD->getDefinition() : CurFD)}; StringRef MangledName = getMangledName(CurGD); llvm::Constant *Func = GetGlobalValue(MangledName); if (!Func) { if (CurFD->isDefined()) { EmitGlobalFunctionDefinition(CurGD, nullptr); Func = GetGlobalValue(MangledName); } else { const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD); llvm::FunctionType *Ty = getTypes().GetFunctionType(FI); Func = GetAddrOfFunction(CurGD, Ty, /*ForVTable=*/false, /*DontDefer=*/false, ForDefinition); } assert(Func && "This should have just been created"); } if (CurFD->getMultiVersionKind() == MultiVersionKind::Target) { const auto *TA = CurFD->getAttr(); llvm::SmallVector Feats; TA->getAddedFeatures(Feats); Options.emplace_back(cast(Func), TA->getArchitecture(), Feats); } else { const auto *TVA = CurFD->getAttr(); llvm::SmallVector Feats; TVA->getFeatures(Feats); Options.emplace_back(cast(Func), /*Architecture*/ "", Feats); } }); } else if (FD->isTargetClonesMultiVersion()) { const auto *TC = FD->getAttr(); for (unsigned VersionIndex = 0; VersionIndex < TC->featuresStrs_size(); ++VersionIndex) { if (!TC->isFirstOfVersion(VersionIndex)) continue; GlobalDecl CurGD{(FD->isDefined() ? 
FD->getDefinition() : FD), VersionIndex}; StringRef Version = TC->getFeatureStr(VersionIndex); StringRef MangledName = getMangledName(CurGD); llvm::Constant *Func = GetGlobalValue(MangledName); if (!Func) { if (FD->isDefined()) { EmitGlobalFunctionDefinition(CurGD, nullptr); Func = GetGlobalValue(MangledName); } else { const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(CurGD); llvm::FunctionType *Ty = getTypes().GetFunctionType(FI); Func = GetAddrOfFunction(CurGD, Ty, /*ForVTable=*/false, /*DontDefer=*/false, ForDefinition); } assert(Func && "This should have just been created"); } StringRef Architecture; llvm::SmallVector Feature; if (getTarget().getTriple().isAArch64()) { if (Version != "default") { llvm::SmallVector VerFeats; Version.split(VerFeats, "+"); for (auto &CurFeat : VerFeats) Feature.push_back(CurFeat.trim()); } } else { if (Version.startswith("arch=")) Architecture = Version.drop_front(sizeof("arch=") - 1); else if (Version != "default") Feature.push_back(Version); } Options.emplace_back(cast(Func), Architecture, Feature); } } else { assert(0 && "Expected a target or target_clones multiversion function"); continue; } llvm::Constant *ResolverConstant = GetOrCreateMultiVersionResolver(GD); if (auto *IFunc = dyn_cast(ResolverConstant)) ResolverConstant = IFunc->getResolver(); llvm::Function *ResolverFunc = cast(ResolverConstant); ResolverFunc->setLinkage(getMultiversionLinkage(*this, GD)); if (supportsCOMDAT()) ResolverFunc->setComdat( getModule().getOrInsertComdat(ResolverFunc->getName())); const TargetInfo &TI = getTarget(); llvm::stable_sort( Options, [&TI](const CodeGenFunction::MultiVersionResolverOption &LHS, const CodeGenFunction::MultiVersionResolverOption &RHS) { return TargetMVPriority(TI, LHS) > TargetMVPriority(TI, RHS); }); CodeGenFunction CGF(*this); CGF.EmitMultiVersionResolver(ResolverFunc, Options); } // Ensure that any additions to the deferred decls list caused by emitting a // variant are emitted. This can happen when the variant itself is inline and // calls a function without linkage. if (!MVFuncsToEmit.empty()) EmitDeferred(); // Ensure that any additions to the multiversion funcs list from either the // deferred decls or the multiversion functions themselves are emitted. 
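// The recursive call below terminates because MultiVersionFuncs is swapped
// into a local worklist (MVFuncsToEmit) at the top of this function, so each
// nested call only sees entries added while emitting the previous batch.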
if (!MultiVersionFuncs.empty()) emitMultiVersionFunctions(); } void CodeGenModule::emitCPUDispatchDefinition(GlobalDecl GD) { const auto *FD = cast(GD.getDecl()); assert(FD && "Not a FunctionDecl?"); assert(FD->isCPUDispatchMultiVersion() && "Not a multiversion function?"); const auto *DD = FD->getAttr(); assert(DD && "Not a cpu_dispatch Function?"); const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD); llvm::FunctionType *DeclTy = getTypes().GetFunctionType(FI); StringRef ResolverName = getMangledName(GD); UpdateMultiVersionNames(GD, FD, ResolverName); llvm::Type *ResolverType; GlobalDecl ResolverGD; if (getTarget().supportsIFunc()) { ResolverType = llvm::FunctionType::get( llvm::PointerType::get(DeclTy, getTypes().getTargetAddressSpace(FD->getType())), false); } else { ResolverType = DeclTy; ResolverGD = GD; } auto *ResolverFunc = cast(GetOrCreateLLVMFunction( ResolverName, ResolverType, ResolverGD, /*ForVTable=*/false)); ResolverFunc->setLinkage(getMultiversionLinkage(*this, GD)); if (supportsCOMDAT()) ResolverFunc->setComdat( getModule().getOrInsertComdat(ResolverFunc->getName())); SmallVector Options; const TargetInfo &Target = getTarget(); unsigned Index = 0; for (const IdentifierInfo *II : DD->cpus()) { // Get the name of the target function so we can look it up/create it. std::string MangledName = getMangledNameImpl(*this, GD, FD, true) + getCPUSpecificMangling(*this, II->getName()); llvm::Constant *Func = GetGlobalValue(MangledName); if (!Func) { GlobalDecl ExistingDecl = Manglings.lookup(MangledName); if (ExistingDecl.getDecl() && ExistingDecl.getDecl()->getAsFunction()->isDefined()) { EmitGlobalFunctionDefinition(ExistingDecl, nullptr); Func = GetGlobalValue(MangledName); } else { if (!ExistingDecl.getDecl()) ExistingDecl = GD.getWithMultiVersionIndex(Index); Func = GetOrCreateLLVMFunction( MangledName, DeclTy, ExistingDecl, /*ForVTable=*/false, /*DontDefer=*/true, /*IsThunk=*/false, llvm::AttributeList(), ForDefinition); } } llvm::SmallVector Features; Target.getCPUSpecificCPUDispatchFeatures(II->getName(), Features); llvm::transform(Features, Features.begin(), [](StringRef Str) { return Str.substr(1); }); llvm::erase_if(Features, [&Target](StringRef Feat) { return !Target.validateCpuSupports(Feat); }); Options.emplace_back(cast(Func), StringRef{}, Features); ++Index; } llvm::stable_sort( Options, [](const CodeGenFunction::MultiVersionResolverOption &LHS, const CodeGenFunction::MultiVersionResolverOption &RHS) { return llvm::X86::getCpuSupportsMask(LHS.Conditions.Features) > llvm::X86::getCpuSupportsMask(RHS.Conditions.Features); }); // If the list contains multiple 'default' versions, such as when it contains // 'pentium' and 'generic', don't emit the call to the generic one (since we // always run on at least a 'pentium'). We do this by deleting the 'least // advanced' (read, lowest mangling letter). 
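// (The stable_sort above orders Options by descending cpu_supports mask, so
// any redundant zero-mask 'default' entries end up at the tail; the loop below
// keeps whichever of the last two has the greater name and drops the other.)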
while (Options.size() > 1 && llvm::X86::getCpuSupportsMask( (Options.end() - 2)->Conditions.Features) == 0) { StringRef LHSName = (Options.end() - 2)->Function->getName(); StringRef RHSName = (Options.end() - 1)->Function->getName(); if (LHSName.compare(RHSName) < 0) Options.erase(Options.end() - 2); else Options.erase(Options.end() - 1); } CodeGenFunction CGF(*this); CGF.EmitMultiVersionResolver(ResolverFunc, Options); if (getTarget().supportsIFunc()) { llvm::GlobalValue::LinkageTypes Linkage = getMultiversionLinkage(*this, GD); auto *IFunc = cast(GetOrCreateMultiVersionResolver(GD)); // Fix up function declarations that were created for cpu_specific before // cpu_dispatch was known if (!isa(IFunc)) { assert(cast(IFunc)->isDeclaration()); auto *GI = llvm::GlobalIFunc::create(DeclTy, 0, Linkage, "", ResolverFunc, &getModule()); GI->takeName(IFunc); IFunc->replaceAllUsesWith(GI); IFunc->eraseFromParent(); IFunc = GI; } std::string AliasName = getMangledNameImpl( *this, GD, FD, /*OmitMultiVersionMangling=*/true); llvm::Constant *AliasFunc = GetGlobalValue(AliasName); if (!AliasFunc) { auto *GA = llvm::GlobalAlias::create(DeclTy, 0, Linkage, AliasName, IFunc, &getModule()); SetCommonAttributes(GD, GA); } } } /// If a dispatcher for the specified mangled name is not in the module, create /// and return an llvm Function with the specified type. llvm::Constant *CodeGenModule::GetOrCreateMultiVersionResolver(GlobalDecl GD) { const auto *FD = cast(GD.getDecl()); assert(FD && "Not a FunctionDecl?"); std::string MangledName = getMangledNameImpl(*this, GD, FD, /*OmitMultiVersionMangling=*/true); // Holds the name of the resolver, in ifunc mode this is the ifunc (which has // a separate resolver). std::string ResolverName = MangledName; if (getTarget().supportsIFunc()) ResolverName += ".ifunc"; else if (FD->isTargetMultiVersion()) ResolverName += ".resolver"; // If the resolver has already been created, just return it. if (llvm::GlobalValue *ResolverGV = GetGlobalValue(ResolverName)) return ResolverGV; const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD); llvm::FunctionType *DeclTy = getTypes().GetFunctionType(FI); // The resolver needs to be created. For target and target_clones, defer // creation until the end of the TU. if (FD->isTargetMultiVersion() || FD->isTargetClonesMultiVersion()) MultiVersionFuncs.push_back(GD); // For cpu_specific, don't create an ifunc yet because we don't know if the // cpu_dispatch will be emitted in this translation unit. if (getTarget().supportsIFunc() && !FD->isCPUSpecificMultiVersion()) { llvm::Type *ResolverType = llvm::FunctionType::get( llvm::PointerType::get(DeclTy, getTypes().getTargetAddressSpace(FD->getType())), false); llvm::Constant *Resolver = GetOrCreateLLVMFunction( MangledName + ".resolver", ResolverType, GlobalDecl{}, /*ForVTable=*/false); llvm::GlobalIFunc *GIF = llvm::GlobalIFunc::create(DeclTy, 0, getMultiversionLinkage(*this, GD), "", Resolver, &getModule()); GIF->setName(ResolverName); SetCommonAttributes(FD, GIF); return GIF; } llvm::Constant *Resolver = GetOrCreateLLVMFunction( ResolverName, DeclTy, GlobalDecl{}, /*ForVTable=*/false); assert(isa(Resolver) && "Resolver should be created for the first time"); SetCommonAttributes(FD, cast(Resolver)); return Resolver; } /// GetOrCreateLLVMFunction - If the specified mangled name is not in the /// module, create and return an llvm Function with the specified type. If there /// is something in the module with the specified name, return it potentially /// bitcasted to the right type. 
/// /// If D is non-null, it specifies a decl that correspond to this. This is used /// to set the attributes on the function when it is first created. llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction( StringRef MangledName, llvm::Type *Ty, GlobalDecl GD, bool ForVTable, bool DontDefer, bool IsThunk, llvm::AttributeList ExtraAttrs, ForDefinition_t IsForDefinition) { const Decl *D = GD.getDecl(); // Any attempts to use a MultiVersion function should result in retrieving // the iFunc instead. Name Mangling will handle the rest of the changes. if (const FunctionDecl *FD = cast_or_null(D)) { // For the device mark the function as one that should be emitted. if (getLangOpts().OpenMPIsTargetDevice && OpenMPRuntime && !OpenMPRuntime->markAsGlobalTarget(GD) && FD->isDefined() && !DontDefer && !IsForDefinition) { if (const FunctionDecl *FDDef = FD->getDefinition()) { GlobalDecl GDDef; if (const auto *CD = dyn_cast(FDDef)) GDDef = GlobalDecl(CD, GD.getCtorType()); else if (const auto *DD = dyn_cast(FDDef)) GDDef = GlobalDecl(DD, GD.getDtorType()); else GDDef = GlobalDecl(FDDef); EmitGlobal(GDDef); } } if (FD->isMultiVersion()) { UpdateMultiVersionNames(GD, FD, MangledName); if (!IsForDefinition) return GetOrCreateMultiVersionResolver(GD); } } // Lookup the entry, lazily creating it if necessary. llvm::GlobalValue *Entry = GetGlobalValue(MangledName); if (Entry) { if (WeakRefReferences.erase(Entry)) { const FunctionDecl *FD = cast_or_null(D); if (FD && !FD->hasAttr()) Entry->setLinkage(llvm::Function::ExternalLinkage); } // Handle dropped DLL attributes. if (D && !D->hasAttr() && !D->hasAttr() && !shouldMapVisibilityToDLLExport(cast_or_null(D))) { Entry->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass); setDSOLocal(Entry); } // If there are two attempts to define the same mangled name, issue an // error. if (IsForDefinition && !Entry->isDeclaration()) { GlobalDecl OtherGD; // Check that GD is not yet in DiagnosedConflictingDefinitions is required // to make sure that we issue an error only once. if (lookupRepresentativeDecl(MangledName, OtherGD) && (GD.getCanonicalDecl().getDecl() != OtherGD.getCanonicalDecl().getDecl()) && DiagnosedConflictingDefinitions.insert(GD).second) { getDiags().Report(D->getLocation(), diag::err_duplicate_mangled_name) << MangledName; getDiags().Report(OtherGD.getDecl()->getLocation(), diag::note_previous_definition); } } if ((isa(Entry) || isa(Entry)) && (Entry->getValueType() == Ty)) { return Entry; } // Make sure the result is of the correct type. // (If function is requested for a definition, we always need to create a new // function, not just return a bitcast.) if (!IsForDefinition) return llvm::ConstantExpr::getBitCast( Entry, Ty->getPointerTo(Entry->getAddressSpace())); } // This function doesn't have a complete type (for example, the return // type is an incomplete struct). Use a fake type instead, and make // sure not to try to set attributes. bool IsIncompleteFunction = false; llvm::FunctionType *FTy; if (isa(Ty)) { FTy = cast(Ty); } else { FTy = llvm::FunctionType::get(VoidTy, false); IsIncompleteFunction = true; } llvm::Function *F = llvm::Function::Create(FTy, llvm::Function::ExternalLinkage, Entry ? StringRef() : MangledName, &getModule()); // If we already created a function with the same mangled name (but different // type) before, take its name and add it to the list of functions to be // replaced with F at the end of CodeGen. // // This happens if there is a prototype for a function (e.g. 
"int f()") and // then a definition of a different type (e.g. "int f(int x)"). if (Entry) { F->takeName(Entry); // This might be an implementation of a function without a prototype, in // which case, try to do special replacement of calls which match the new // prototype. The really key thing here is that we also potentially drop // arguments from the call site so as to make a direct call, which makes the // inliner happier and suppresses a number of optimizer warnings (!) about // dropping arguments. if (!Entry->use_empty()) { ReplaceUsesOfNonProtoTypeWithRealFunction(Entry, F); Entry->removeDeadConstantUsers(); } llvm::Constant *BC = llvm::ConstantExpr::getBitCast( F, Entry->getValueType()->getPointerTo(Entry->getAddressSpace())); addGlobalValReplacement(Entry, BC); } assert(F->getName() == MangledName && "name was uniqued!"); if (D) SetFunctionAttributes(GD, F, IsIncompleteFunction, IsThunk); if (ExtraAttrs.hasFnAttrs()) { llvm::AttrBuilder B(F->getContext(), ExtraAttrs.getFnAttrs()); F->addFnAttrs(B); } if (!DontDefer) { // All MSVC dtors other than the base dtor are linkonce_odr and delegate to // each other bottoming out with the base dtor. Therefore we emit non-base // dtors on usage, even if there is no dtor definition in the TU. if (isa_and_nonnull(D) && getCXXABI().useThunkForDtorVariant(cast(D), GD.getDtorType())) addDeferredDeclToEmit(GD); // This is the first use or definition of a mangled name. If there is a // deferred decl with this name, remember that we need to emit it at the end // of the file. auto DDI = DeferredDecls.find(MangledName); if (DDI != DeferredDecls.end()) { // Move the potentially referenced deferred decl to the // DeferredDeclsToEmit list, and remove it from DeferredDecls (since we // don't need it anymore). addDeferredDeclToEmit(DDI->second); DeferredDecls.erase(DDI); // Otherwise, there are cases we have to worry about where we're // using a declaration for which we must emit a definition but where // we might not find a top-level definition: // - member functions defined inline in their classes // - friend functions defined inline in some class // - special member functions with implicit definitions // If we ever change our AST traversal to walk into class methods, // this will be unnecessary. // // We also don't emit a definition for a function if it's going to be an // entry in a vtable, unless it's already marked as used. } else if (getLangOpts().CPlusPlus && D) { // Look for a declaration that's lexically in a record. for (const auto *FD = cast(D)->getMostRecentDecl(); FD; FD = FD->getPreviousDecl()) { if (isa(FD->getLexicalDeclContext())) { if (FD->doesThisDeclarationHaveABody()) { addDeferredDeclToEmit(GD.getWithDecl(FD)); break; } } } } } // Make sure the result is of the requested type. if (!IsIncompleteFunction) { assert(F->getFunctionType() == Ty); return F; } return llvm::ConstantExpr::getBitCast(F, Ty->getPointerTo(F->getAddressSpace())); } /// GetAddrOfFunction - Return the address of the given function. If Ty is /// non-null, then this function will use the specified type if it has to /// create it (this occurs when we see a definition of the function). llvm::Constant * CodeGenModule::GetAddrOfFunction(GlobalDecl GD, llvm::Type *Ty, bool ForVTable, bool DontDefer, ForDefinition_t IsForDefinition) { // If there was no specific requested type, just convert it now. 
if (!Ty) { const auto *FD = cast(GD.getDecl()); Ty = getTypes().ConvertType(FD->getType()); } // Devirtualized destructor calls may come through here instead of via // getAddrOfCXXStructor. Make sure we use the MS ABI base destructor instead // of the complete destructor when necessary. if (const auto *DD = dyn_cast(GD.getDecl())) { if (getTarget().getCXXABI().isMicrosoft() && GD.getDtorType() == Dtor_Complete && DD->getParent()->getNumVBases() == 0) GD = GlobalDecl(DD, Dtor_Base); } StringRef MangledName = getMangledName(GD); auto *F = GetOrCreateLLVMFunction(MangledName, Ty, GD, ForVTable, DontDefer, /*IsThunk=*/false, llvm::AttributeList(), IsForDefinition); // Returns kernel handle for HIP kernel stub function. if (LangOpts.CUDA && !LangOpts.CUDAIsDevice && cast(GD.getDecl())->hasAttr()) { auto *Handle = getCUDARuntime().getKernelHandle( cast(F->stripPointerCasts()), GD); if (IsForDefinition) return F; return llvm::ConstantExpr::getBitCast(Handle, Ty->getPointerTo()); } return F; } llvm::Constant *CodeGenModule::GetFunctionStart(const ValueDecl *Decl) { llvm::GlobalValue *F = cast(GetAddrOfFunction(Decl)->stripPointerCasts()); return llvm::ConstantExpr::getBitCast( llvm::NoCFIValue::get(F), llvm::Type::getInt8PtrTy(VMContext, F->getAddressSpace())); } static const FunctionDecl * GetRuntimeFunctionDecl(ASTContext &C, StringRef Name) { TranslationUnitDecl *TUDecl = C.getTranslationUnitDecl(); DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); IdentifierInfo &CII = C.Idents.get(Name); for (const auto *Result : DC->lookup(&CII)) if (const auto *FD = dyn_cast(Result)) return FD; if (!C.getLangOpts().CPlusPlus) return nullptr; // Demangle the premangled name from getTerminateFn() IdentifierInfo &CXXII = (Name == "_ZSt9terminatev" || Name == "?terminate@@YAXXZ") ? C.Idents.get("terminate") : C.Idents.get(Name); for (const auto &N : {"__cxxabiv1", "std"}) { IdentifierInfo &NS = C.Idents.get(N); for (const auto *Result : DC->lookup(&NS)) { const NamespaceDecl *ND = dyn_cast(Result); if (auto *LSD = dyn_cast(Result)) for (const auto *Result : LSD->lookup(&NS)) if ((ND = dyn_cast(Result))) break; if (ND) for (const auto *Result : ND->lookup(&CXXII)) if (const auto *FD = dyn_cast(Result)) return FD; } } return nullptr; } /// CreateRuntimeFunction - Create a new runtime function with the specified /// type and name. llvm::FunctionCallee CodeGenModule::CreateRuntimeFunction(llvm::FunctionType *FTy, StringRef Name, llvm::AttributeList ExtraAttrs, bool Local, bool AssumeConvergent) { if (AssumeConvergent) { ExtraAttrs = ExtraAttrs.addFnAttribute(VMContext, llvm::Attribute::Convergent); } llvm::Constant *C = GetOrCreateLLVMFunction(Name, FTy, GlobalDecl(), /*ForVTable=*/false, /*DontDefer=*/false, /*IsThunk=*/false, ExtraAttrs); if (auto *F = dyn_cast(C)) { if (F->empty()) { F->setCallingConv(getRuntimeCC()); // In Windows Itanium environments, try to mark runtime functions // dllimport. For Mingw and MSVC, don't. We don't really know if the user // will link their standard library statically or dynamically. Marking // functions imported when they are not imported can cause linker errors // and warnings. 
if (!Local && getTriple().isWindowsItaniumEnvironment() && !getCodeGenOpts().LTOVisibilityPublicStd) { const FunctionDecl *FD = GetRuntimeFunctionDecl(Context, Name); if (!FD || FD->hasAttr()) { F->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass); F->setLinkage(llvm::GlobalValue::ExternalLinkage); } } setDSOLocal(F); } } return {FTy, C}; } /// isTypeConstant - Determine whether an object of this type can be emitted /// as a constant. /// /// If ExcludeCtor is true, the duration when the object's constructor runs /// will not be considered. The caller will need to verify that the object is /// not written to during its construction. ExcludeDtor works similarly. bool CodeGenModule::isTypeConstant(QualType Ty, bool ExcludeCtor, bool ExcludeDtor) { if (!Ty.isConstant(Context) && !Ty->isReferenceType()) return false; if (Context.getLangOpts().CPlusPlus) { if (const CXXRecordDecl *Record = Context.getBaseElementType(Ty)->getAsCXXRecordDecl()) return ExcludeCtor && !Record->hasMutableFields() && (Record->hasTrivialDestructor() || ExcludeDtor); } return true; } /// GetOrCreateLLVMGlobal - If the specified mangled name is not in the module, /// create and return an llvm GlobalVariable with the specified type and address /// space. If there is something in the module with the specified name, return /// it potentially bitcasted to the right type. /// /// If D is non-null, it specifies a decl that correspond to this. This is used /// to set the attributes on the global when it is first created. /// /// If IsForDefinition is true, it is guaranteed that an actual global with /// type Ty will be returned, not conversion of a variable with the same /// mangled name but some other type. llvm::Constant * CodeGenModule::GetOrCreateLLVMGlobal(StringRef MangledName, llvm::Type *Ty, LangAS AddrSpace, const VarDecl *D, ForDefinition_t IsForDefinition) { // Lookup the entry, lazily creating it if necessary. llvm::GlobalValue *Entry = GetGlobalValue(MangledName); unsigned TargetAS = getContext().getTargetAddressSpace(AddrSpace); if (Entry) { if (WeakRefReferences.erase(Entry)) { if (D && !D->hasAttr()) Entry->setLinkage(llvm::Function::ExternalLinkage); } // Handle dropped DLL attributes. if (D && !D->hasAttr() && !D->hasAttr() && !shouldMapVisibilityToDLLExport(D)) Entry->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass); if (LangOpts.OpenMP && !LangOpts.OpenMPSimd && D) getOpenMPRuntime().registerTargetGlobalVariable(D, Entry); if (Entry->getValueType() == Ty && Entry->getAddressSpace() == TargetAS) return Entry; // If there are two attempts to define the same mangled name, issue an // error. if (IsForDefinition && !Entry->isDeclaration()) { GlobalDecl OtherGD; const VarDecl *OtherD; // Check that D is not yet in DiagnosedConflictingDefinitions is required // to make sure that we issue an error only once. if (D && lookupRepresentativeDecl(MangledName, OtherGD) && (D->getCanonicalDecl() != OtherGD.getCanonicalDecl().getDecl()) && (OtherD = dyn_cast(OtherGD.getDecl())) && OtherD->hasInit() && DiagnosedConflictingDefinitions.insert(D).second) { getDiags().Report(D->getLocation(), diag::err_duplicate_mangled_name) << MangledName; getDiags().Report(OtherGD.getDecl()->getLocation(), diag::note_previous_definition); } } // Make sure the result is of the correct type. 
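// (A declaration that lives in a different address space needs an
// addrspacecast; a plain type mismatch only needs a bitcast, and only when we
// are not about to create a definition.)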
if (Entry->getType()->getAddressSpace() != TargetAS) { return llvm::ConstantExpr::getAddrSpaceCast(Entry, Ty->getPointerTo(TargetAS)); } // (If global is requested for a definition, we always need to create a new // global, not just return a bitcast.) if (!IsForDefinition) return llvm::ConstantExpr::getBitCast(Entry, Ty->getPointerTo(TargetAS)); } auto DAddrSpace = GetGlobalVarAddressSpace(D); auto *GV = new llvm::GlobalVariable( getModule(), Ty, false, llvm::GlobalValue::ExternalLinkage, nullptr, MangledName, nullptr, llvm::GlobalVariable::NotThreadLocal, getContext().getTargetAddressSpace(DAddrSpace)); // If we already created a global with the same mangled name (but different // type) before, take its name and remove it from its parent. if (Entry) { GV->takeName(Entry); if (!Entry->use_empty()) { llvm::Constant *NewPtrForOldDecl = llvm::ConstantExpr::getBitCast(GV, Entry->getType()); Entry->replaceAllUsesWith(NewPtrForOldDecl); } Entry->eraseFromParent(); } // This is the first use or definition of a mangled name. If there is a // deferred decl with this name, remember that we need to emit it at the end // of the file. auto DDI = DeferredDecls.find(MangledName); if (DDI != DeferredDecls.end()) { // Move the potentially referenced deferred decl to the DeferredDeclsToEmit // list, and remove it from DeferredDecls (since we don't need it anymore). addDeferredDeclToEmit(DDI->second); DeferredDecls.erase(DDI); } // Handle things which are present even on external declarations. if (D) { if (LangOpts.OpenMP && !LangOpts.OpenMPSimd) getOpenMPRuntime().registerTargetGlobalVariable(D, GV); // FIXME: This code is overly simple and should be merged with other global // handling. GV->setConstant(isTypeConstant(D->getType(), false, false)); GV->setAlignment(getContext().getDeclAlign(D).getAsAlign()); setLinkageForGV(GV, D); if (D->getTLSKind()) { if (D->getTLSKind() == VarDecl::TLS_Dynamic) CXXThreadLocals.push_back(D); setTLSMode(GV, *D); } setGVProperties(GV, D); // If required by the ABI, treat declarations of static data members with // inline initializers as definitions. if (getContext().isMSStaticDataMemberInlineDefinition(D)) { EmitGlobalVarDefinition(D); } // Emit section information for extern variables. if (D->hasExternalStorage()) { if (const SectionAttr *SA = D->getAttr()) GV->setSection(SA->getName()); } // Handle XCore-specific ABI requirements. if (getTriple().getArch() == llvm::Triple::xcore && D->getLanguageLinkage() == CLanguageLinkage && D->getType().isConstant(Context) && isExternallyVisible(D->getLinkageAndVisibility().getLinkage())) GV->setSection(".cp.rodata"); // Check if we have a const declaration with an initializer; we may be // able to emit it as available_externally to expose its value to the // optimizer. if (Context.getLangOpts().CPlusPlus && GV->hasExternalLinkage() && D->getType().isConstQualified() && !GV->hasInitializer() && !D->hasDefinition() && D->hasInit() && !D->hasAttr()) { const auto *Record = Context.getBaseElementType(D->getType())->getAsCXXRecordDecl(); bool HasMutableFields = Record && Record->hasMutableFields(); if (!HasMutableFields) { const VarDecl *InitDecl; const Expr *InitExpr = D->getAnyInitializer(InitDecl); if (InitExpr) { ConstantEmitter emitter(*this); llvm::Constant *Init = emitter.tryEmitForInitializer(*InitDecl); if (Init) { auto *InitType = Init->getType(); if (GV->getValueType() != InitType) { // The type of the initializer does not match the definition.
// This happens when an initializer has a different type from // the type of the global (because of padding at the end of a // structure for instance). GV->setName(StringRef()); // Make a new global with the correct type, this is now guaranteed // to work. auto *NewGV = cast( GetAddrOfGlobalVar(D, InitType, IsForDefinition) ->stripPointerCasts()); // Erase the old global, since it is no longer used. GV->eraseFromParent(); GV = NewGV; } else { GV->setInitializer(Init); GV->setConstant(true); GV->setLinkage(llvm::GlobalValue::AvailableExternallyLinkage); } emitter.finalize(GV); } } } } } if (D && D->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly) { getTargetCodeGenInfo().setTargetAttributes(D, GV, *this); // External HIP managed variables needed to be recorded for transformation // in both device and host compilations. if (getLangOpts().CUDA && D && D->hasAttr() && D->hasExternalStorage()) getCUDARuntime().handleVarRegistration(D, *GV); } if (D) SanitizerMD->reportGlobal(GV, *D); LangAS ExpectedAS = D ? D->getType().getAddressSpace() : (LangOpts.OpenCL ? LangAS::opencl_global : LangAS::Default); assert(getContext().getTargetAddressSpace(ExpectedAS) == TargetAS); if (DAddrSpace != ExpectedAS) { return getTargetCodeGenInfo().performAddrSpaceCast( *this, GV, DAddrSpace, ExpectedAS, Ty->getPointerTo(TargetAS)); } return GV; } llvm::Constant * CodeGenModule::GetAddrOfGlobal(GlobalDecl GD, ForDefinition_t IsForDefinition) { const Decl *D = GD.getDecl(); if (isa(D) || isa(D)) return getAddrOfCXXStructor(GD, /*FnInfo=*/nullptr, /*FnType=*/nullptr, /*DontDefer=*/false, IsForDefinition); if (isa(D)) { auto FInfo = &getTypes().arrangeCXXMethodDeclaration(cast(D)); auto Ty = getTypes().GetFunctionType(*FInfo); return GetAddrOfFunction(GD, Ty, /*ForVTable=*/false, /*DontDefer=*/false, IsForDefinition); } if (isa(D)) { const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD); llvm::FunctionType *Ty = getTypes().GetFunctionType(FI); return GetAddrOfFunction(GD, Ty, /*ForVTable=*/false, /*DontDefer=*/false, IsForDefinition); } return GetAddrOfGlobalVar(cast(D), /*Ty=*/nullptr, IsForDefinition); } llvm::GlobalVariable *CodeGenModule::CreateOrReplaceCXXRuntimeVariable( StringRef Name, llvm::Type *Ty, llvm::GlobalValue::LinkageTypes Linkage, llvm::Align Alignment) { llvm::GlobalVariable *GV = getModule().getNamedGlobal(Name); llvm::GlobalVariable *OldGV = nullptr; if (GV) { // Check if the variable has the right type. if (GV->getValueType() == Ty) return GV; // Because C++ name mangling, the only way we can end up with an already // existing global with the same name is if it has been declared extern "C". assert(GV->isDeclaration() && "Declaration has wrong type!"); OldGV = GV; } // Create a new variable. GV = new llvm::GlobalVariable(getModule(), Ty, /*isConstant=*/true, Linkage, nullptr, Name); if (OldGV) { // Replace occurrences of the old variable if needed. GV->takeName(OldGV); if (!OldGV->use_empty()) { llvm::Constant *NewPtrForOldDecl = llvm::ConstantExpr::getBitCast(GV, OldGV->getType()); OldGV->replaceAllUsesWith(NewPtrForOldDecl); } OldGV->eraseFromParent(); } if (supportsCOMDAT() && GV->isWeakForLinker() && !GV->hasAvailableExternallyLinkage()) GV->setComdat(TheModule.getOrInsertComdat(GV->getName())); GV->setAlignment(Alignment); return GV; } /// GetAddrOfGlobalVar - Return the llvm::Constant for the address of the /// given global variable. 
If Ty is non-null and if the global doesn't exist, /// then it will be created with the specified type instead of whatever the /// normal requested type would be. If IsForDefinition is true, it is guaranteed /// that an actual global with type Ty will be returned, not conversion of a /// variable with the same mangled name but some other type. llvm::Constant *CodeGenModule::GetAddrOfGlobalVar(const VarDecl *D, llvm::Type *Ty, ForDefinition_t IsForDefinition) { assert(D->hasGlobalStorage() && "Not a global variable"); QualType ASTTy = D->getType(); if (!Ty) Ty = getTypes().ConvertTypeForMem(ASTTy); StringRef MangledName = getMangledName(D); return GetOrCreateLLVMGlobal(MangledName, Ty, ASTTy.getAddressSpace(), D, IsForDefinition); } /// CreateRuntimeVariable - Create a new runtime global variable with the /// specified type and name. llvm::Constant * CodeGenModule::CreateRuntimeVariable(llvm::Type *Ty, StringRef Name) { LangAS AddrSpace = getContext().getLangOpts().OpenCL ? LangAS::opencl_global : LangAS::Default; auto *Ret = GetOrCreateLLVMGlobal(Name, Ty, AddrSpace, nullptr); setDSOLocal(cast(Ret->stripPointerCasts())); return Ret; } void CodeGenModule::EmitTentativeDefinition(const VarDecl *D) { assert(!D->getInit() && "Cannot emit definite definitions here!"); StringRef MangledName = getMangledName(D); llvm::GlobalValue *GV = GetGlobalValue(MangledName); // We already have a definition, not declaration, with the same mangled name. // Emitting of declaration is not required (and actually overwrites emitted // definition). if (GV && !GV->isDeclaration()) return; // If we have not seen a reference to this variable yet, place it into the // deferred declarations table to be emitted if needed later. if (!MustBeEmitted(D) && !GV) { DeferredDecls[MangledName] = D; return; } // The tentative definition is the only definition. EmitGlobalVarDefinition(D); } void CodeGenModule::EmitExternalDeclaration(const VarDecl *D) { EmitExternalVarDeclaration(D); } CharUnits CodeGenModule::GetTargetTypeStoreSize(llvm::Type *Ty) const { return Context.toCharUnitsFromBits( getDataLayout().getTypeStoreSizeInBits(Ty)); } LangAS CodeGenModule::GetGlobalVarAddressSpace(const VarDecl *D) { if (LangOpts.OpenCL) { LangAS AS = D ? D->getType().getAddressSpace() : LangAS::opencl_global; assert(AS == LangAS::opencl_global || AS == LangAS::opencl_global_device || AS == LangAS::opencl_global_host || AS == LangAS::opencl_constant || AS == LangAS::opencl_local || AS >= LangAS::FirstTargetAddressSpace); return AS; } if (LangOpts.SYCLIsDevice && (!D || D->getType().getAddressSpace() == LangAS::Default)) return LangAS::sycl_global; if (LangOpts.CUDA && LangOpts.CUDAIsDevice) { if (D) { if (D->hasAttr()) return LangAS::cuda_constant; if (D->hasAttr()) return LangAS::cuda_shared; if (D->hasAttr()) return LangAS::cuda_device; if (D->getType().isConstQualified()) return LangAS::cuda_constant; } return LangAS::cuda_device; } if (LangOpts.OpenMP) { LangAS AS; if (OpenMPRuntime->hasAllocateAttributeForGlobalVar(D, AS)) return AS; } return getTargetCodeGenInfo().getGlobalVarAddressSpace(*this, D); } LangAS CodeGenModule::GetGlobalConstantAddressSpace() const { // OpenCL v1.2 s6.5.3: a string literal is in the constant address space. if (LangOpts.OpenCL) return LangAS::opencl_constant; if (LangOpts.SYCLIsDevice) return LangAS::sycl_global; if (LangOpts.HIP && LangOpts.CUDAIsDevice && getTriple().isSPIRV()) // For HIPSPV map literals to cuda_device (maps to CrossWorkGroup in SPIR-V) // instead of default AS (maps to Generic in SPIR-V). 
Otherwise, we end up // with OpVariable instructions with Generic storage class which is not // allowed (SPIR-V V1.6 s3.42.8). Also, mapping literals to SPIR-V // UniformConstant storage class is not viable as pointers to it may not be // casted to Generic pointers which are used to model HIP's "flat" pointers. return LangAS::cuda_device; if (auto AS = getTarget().getConstantAddressSpace()) return *AS; return LangAS::Default; } // In address space agnostic languages, string literals are in default address // space in AST. However, certain targets (e.g. amdgcn) request them to be // emitted in constant address space in LLVM IR. To be consistent with other // parts of AST, string literal global variables in constant address space // need to be casted to default address space before being put into address // map and referenced by other part of CodeGen. // In OpenCL, string literals are in constant address space in AST, therefore // they should not be casted to default address space. static llvm::Constant * castStringLiteralToDefaultAddressSpace(CodeGenModule &CGM, llvm::GlobalVariable *GV) { llvm::Constant *Cast = GV; if (!CGM.getLangOpts().OpenCL) { auto AS = CGM.GetGlobalConstantAddressSpace(); if (AS != LangAS::Default) Cast = CGM.getTargetCodeGenInfo().performAddrSpaceCast( CGM, GV, AS, LangAS::Default, GV->getValueType()->getPointerTo( CGM.getContext().getTargetAddressSpace(LangAS::Default))); } return Cast; } template void CodeGenModule::MaybeHandleStaticInExternC(const SomeDecl *D, llvm::GlobalValue *GV) { if (!getLangOpts().CPlusPlus) return; // Must have 'used' attribute, or else inline assembly can't rely on // the name existing. if (!D->template hasAttr()) return; // Must have internal linkage and an ordinary name. if (!D->getIdentifier() || D->getFormalLinkage() != InternalLinkage) return; // Must be in an extern "C" context. Entities declared directly within // a record are not extern "C" even if the record is in such a context. const SomeDecl *First = D->getFirstDecl(); if (First->getDeclContext()->isRecord() || !First->isInExternCContext()) return; // OK, this is an internal linkage entity inside an extern "C" linkage // specification. Make a note of that so we can give it the "expected" // mangled name if nothing else is using that name. std::pair R = StaticExternCValues.insert(std::make_pair(D->getIdentifier(), GV)); // If we have multiple internal linkage entities with the same name // in extern "C" regions, none of them gets that name. if (!R.second) R.first->second = nullptr; } static bool shouldBeInCOMDAT(CodeGenModule &CGM, const Decl &D) { if (!CGM.supportsCOMDAT()) return false; if (D.hasAttr()) return true; GVALinkage Linkage; if (auto *VD = dyn_cast(&D)) Linkage = CGM.getContext().GetGVALinkageForVariable(VD); else Linkage = CGM.getContext().GetGVALinkageForFunction(cast(&D)); switch (Linkage) { case GVA_Internal: case GVA_AvailableExternally: case GVA_StrongExternal: return false; case GVA_DiscardableODR: case GVA_StrongODR: return true; } llvm_unreachable("No such linkage"); } bool CodeGenModule::supportsCOMDAT() const { return getTriple().supportsCOMDAT(); } void CodeGenModule::maybeSetTrivialComdat(const Decl &D, llvm::GlobalObject &GO) { if (!shouldBeInCOMDAT(*this, D)) return; GO.setComdat(TheModule.getOrInsertComdat(GO.getName())); } /// Pass IsTentative as true if you want to create a tentative definition. 
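/// For example (illustrative): a C translation unit containing only
///   int n;
/// holds a tentative definition of 'n'; it is emitted zero-initialized at
/// the end of the TU unless a definition with an explicit initializer
/// appears first.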
void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D, bool IsTentative) { // OpenCL global variables of sampler type are translated to function calls, // therefore no need to be translated. QualType ASTTy = D->getType(); if (getLangOpts().OpenCL && ASTTy->isSamplerT()) return; // If this is OpenMP device, check if it is legal to emit this global // normally. if (LangOpts.OpenMPIsTargetDevice && OpenMPRuntime && OpenMPRuntime->emitTargetGlobalVariable(D)) return; llvm::TrackingVH Init; bool NeedsGlobalCtor = false; // Whether the definition of the variable is available externally. // If yes, we shouldn't emit the GloablCtor and GlobalDtor for the variable // since this is the job for its original source. bool IsDefinitionAvailableExternally = getContext().GetGVALinkageForVariable(D) == GVA_AvailableExternally; bool NeedsGlobalDtor = !IsDefinitionAvailableExternally && D->needsDestruction(getContext()) == QualType::DK_cxx_destructor; const VarDecl *InitDecl; const Expr *InitExpr = D->getAnyInitializer(InitDecl); std::optional emitter; // CUDA E.2.4.1 "__shared__ variables cannot have an initialization // as part of their declaration." Sema has already checked for // error cases, so we just need to set Init to UndefValue. bool IsCUDASharedVar = getLangOpts().CUDAIsDevice && D->hasAttr(); // Shadows of initialized device-side global variables are also left // undefined. // Managed Variables should be initialized on both host side and device side. bool IsCUDAShadowVar = !getLangOpts().CUDAIsDevice && !D->hasAttr() && (D->hasAttr() || D->hasAttr() || D->hasAttr()); bool IsCUDADeviceShadowVar = getLangOpts().CUDAIsDevice && !D->hasAttr() && (D->getType()->isCUDADeviceBuiltinSurfaceType() || D->getType()->isCUDADeviceBuiltinTextureType()); if (getLangOpts().CUDA && (IsCUDASharedVar || IsCUDAShadowVar || IsCUDADeviceShadowVar)) Init = llvm::UndefValue::get(getTypes().ConvertTypeForMem(ASTTy)); else if (D->hasAttr()) Init = llvm::UndefValue::get(getTypes().ConvertTypeForMem(ASTTy)); else if (!InitExpr) { // This is a tentative definition; tentative definitions are // implicitly initialized with { 0 }. // // Note that tentative definitions are only emitted at the end of // a translation unit, so they should never have incomplete // type. In addition, EmitTentativeDefinition makes sure that we // never attempt to emit a tentative definition if a real one // exists. A use may still exists, however, so we still may need // to do a RAUW. assert(!ASTTy->isIncompleteType() && "Unexpected incomplete type"); Init = EmitNullConstant(D->getType()); } else { initializedGlobalDecl = GlobalDecl(D); emitter.emplace(*this); llvm::Constant *Initializer = emitter->tryEmitForInitializer(*InitDecl); if (!Initializer) { QualType T = InitExpr->getType(); if (D->getType()->isReferenceType()) T = D->getType(); if (getLangOpts().CPlusPlus) { if (InitDecl->hasFlexibleArrayInit(getContext())) ErrorUnsupported(D, "flexible array initializer"); Init = EmitNullConstant(T); if (!IsDefinitionAvailableExternally) NeedsGlobalCtor = true; } else { ErrorUnsupported(D, "static initializer"); Init = llvm::UndefValue::get(getTypes().ConvertType(T)); } } else { Init = Initializer; // We don't need an initializer, so remove the entry for the delayed // initializer position (just in case this entry was delayed) if we // also don't need to register a destructor. 
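// For example (illustrative): for
//   const int x = 40 + 2;
// the initializer folds to a constant, so 'x' needs no dynamic
// initialization and any delayed-init slot reserved for it can be dropped.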
if (getLangOpts().CPlusPlus && !NeedsGlobalDtor) DelayedCXXInitPosition.erase(D); #ifndef NDEBUG CharUnits VarSize = getContext().getTypeSizeInChars(ASTTy) + InitDecl->getFlexibleArrayInitChars(getContext()); CharUnits CstSize = CharUnits::fromQuantity( getDataLayout().getTypeAllocSize(Init->getType())); assert(VarSize == CstSize && "Emitted constant has unexpected size"); #endif } } llvm::Type* InitType = Init->getType(); llvm::Constant *Entry = GetAddrOfGlobalVar(D, InitType, ForDefinition_t(!IsTentative)); // Strip off pointer casts if we got them. Entry = Entry->stripPointerCasts(); // Entry is now either a Function or GlobalVariable. auto *GV = dyn_cast(Entry); // We have a definition after a declaration with the wrong type. // We must make a new GlobalVariable* and update everything that used OldGV // (a declaration or tentative definition) with the new GlobalVariable* // (which will be a definition). // // This happens if there is a prototype for a global (e.g. // "extern int x[];") and then a definition of a different type (e.g. // "int x[10];"). This also happens when an initializer has a different type // from the type of the global (this happens with unions). if (!GV || GV->getValueType() != InitType || GV->getType()->getAddressSpace() != getContext().getTargetAddressSpace(GetGlobalVarAddressSpace(D))) { // Move the old entry aside so that we'll create a new one. Entry->setName(StringRef()); // Make a new global with the correct type, this is now guaranteed to work. GV = cast( GetAddrOfGlobalVar(D, InitType, ForDefinition_t(!IsTentative)) ->stripPointerCasts()); // Replace all uses of the old global with the new global llvm::Constant *NewPtrForOldDecl = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, Entry->getType()); Entry->replaceAllUsesWith(NewPtrForOldDecl); // Erase the old global, since it is no longer used. cast(Entry)->eraseFromParent(); } MaybeHandleStaticInExternC(D, GV); if (D->hasAttr()) AddGlobalAnnotations(D, GV); // Set the llvm linkage type as appropriate. llvm::GlobalValue::LinkageTypes Linkage = getLLVMLinkageVarDefinition(D); // CUDA B.2.1 "The __device__ qualifier declares a variable that resides on // the device. [...]" // CUDA B.2.2 "The __constant__ qualifier, optionally used together with // __device__, declares a variable that: [...] // Is accessible from all the threads within the grid and from the host // through the runtime library (cudaGetSymbolAddress() / cudaGetSymbolSize() // / cudaMemcpyToSymbol() / cudaMemcpyFromSymbol())." if (LangOpts.CUDA) { if (LangOpts.CUDAIsDevice) { if (Linkage != llvm::GlobalValue::InternalLinkage && (D->hasAttr() || D->hasAttr() || D->getType()->isCUDADeviceBuiltinSurfaceType() || D->getType()->isCUDADeviceBuiltinTextureType())) GV->setExternallyInitialized(true); } else { getCUDARuntime().internalizeDeviceSideVar(D, Linkage); } getCUDARuntime().handleVarRegistration(D, *GV); } GV->setInitializer(Init); if (emitter) emitter->finalize(GV); // If it is safe to mark the global 'constant', do so now. GV->setConstant(!NeedsGlobalCtor && !NeedsGlobalDtor && isTypeConstant(D->getType(), true, true)); // If it is in a read-only section, mark it 'constant'. if (const SectionAttr *SA = D->getAttr()) { const ASTContext::SectionInfo &SI = Context.SectionInfos[SA->getName()]; if ((SI.SectionFlags & ASTContext::PSF_Write) == 0) GV->setConstant(true); } CharUnits AlignVal = getContext().getDeclAlign(D); // Check for alignment specifed in an 'omp allocate' directive. 
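// For example (illustrative, OpenMP 5.1 'align' clause):
//   int buf[1024];
//   #pragma omp allocate(buf) align(64)
// would override the declared alignment with 64 bytes here.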
if (std::optional AlignValFromAllocate = getOMPAllocateAlignment(D)) AlignVal = *AlignValFromAllocate; GV->setAlignment(AlignVal.getAsAlign()); // On Darwin, unlike other Itanium C++ ABI platforms, the thread-wrapper // function is only defined alongside the variable, not also alongside // callers. Normally, all accesses to a thread_local go through the // thread-wrapper in order to ensure initialization has occurred, underlying // variable will never be used other than the thread-wrapper, so it can be // converted to internal linkage. // // However, if the variable has the 'constinit' attribute, it _can_ be // referenced directly, without calling the thread-wrapper, so the linkage // must not be changed. // // Additionally, if the variable isn't plain external linkage, e.g. if it's // weak or linkonce, the de-duplication semantics are important to preserve, // so we don't change the linkage. if (D->getTLSKind() == VarDecl::TLS_Dynamic && Linkage == llvm::GlobalValue::ExternalLinkage && Context.getTargetInfo().getTriple().isOSDarwin() && !D->hasAttr()) Linkage = llvm::GlobalValue::InternalLinkage; GV->setLinkage(Linkage); if (D->hasAttr()) GV->setDLLStorageClass(llvm::GlobalVariable::DLLImportStorageClass); else if (D->hasAttr()) GV->setDLLStorageClass(llvm::GlobalVariable::DLLExportStorageClass); else GV->setDLLStorageClass(llvm::GlobalVariable::DefaultStorageClass); if (Linkage == llvm::GlobalVariable::CommonLinkage) { // common vars aren't constant even if declared const. GV->setConstant(false); // Tentative definition of global variables may be initialized with // non-zero null pointers. In this case they should have weak linkage // since common linkage must have zero initializer and must not have // explicit section therefore cannot have non-zero initial value. if (!GV->getInitializer()->isNullValue()) GV->setLinkage(llvm::GlobalVariable::WeakAnyLinkage); } setNonAliasAttributes(D, GV); if (D->getTLSKind() && !GV->isThreadLocal()) { if (D->getTLSKind() == VarDecl::TLS_Dynamic) CXXThreadLocals.push_back(D); setTLSMode(GV, *D); } maybeSetTrivialComdat(*D, *GV); // Emit the initializer function if necessary. if (NeedsGlobalCtor || NeedsGlobalDtor) EmitCXXGlobalVarDeclInitFunc(D, GV, NeedsGlobalCtor); SanitizerMD->reportGlobal(GV, *D, NeedsGlobalCtor); // Emit global variable debug information. if (CGDebugInfo *DI = getModuleDebugInfo()) if (getCodeGenOpts().hasReducedDebugInfo()) DI->EmitGlobalVariable(GV, D); } void CodeGenModule::EmitExternalVarDeclaration(const VarDecl *D) { if (CGDebugInfo *DI = getModuleDebugInfo()) if (getCodeGenOpts().hasReducedDebugInfo()) { QualType ASTTy = D->getType(); llvm::Type *Ty = getTypes().ConvertTypeForMem(D->getType()); llvm::Constant *GV = GetOrCreateLLVMGlobal(D->getName(), Ty, ASTTy.getAddressSpace(), D); DI->EmitExternalVariable( cast(GV->stripPointerCasts()), D); } } static bool isVarDeclStrongDefinition(const ASTContext &Context, CodeGenModule &CGM, const VarDecl *D, bool NoCommon) { // Don't give variables common linkage if -fno-common was specified unless it // was overridden by a NoCommon attribute. if ((NoCommon || D->hasAttr()) && !D->hasAttr()) return true; // C11 6.9.2/2: // A declaration of an identifier for an object that has file scope without // an initializer, and without a storage-class specifier or with the // storage-class specifier static, constitutes a tentative definition. if (D->getInit() || D->hasExternalStorage()) return true; // A variable cannot be both common and exist in a section. 
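// For example (illustrative): a tentative definition such as
//   __attribute__((section(".mydata"))) int counter;
// must be a strong definition, since common linkage would lose the
// requested section.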
if (D->hasAttr()) return true; // A variable cannot be both common and exist in a section. // We don't try to determine which is the right section in the front-end. // If no specialized section name is applicable, it will resort to default. if (D->hasAttr() || D->hasAttr() || D->hasAttr() || D->hasAttr()) return true; // Thread local vars aren't considered common linkage. if (D->getTLSKind()) return true; // Tentative definitions marked with WeakImportAttr are true definitions. if (D->hasAttr()) return true; // A variable cannot be both common and exist in a comdat. if (shouldBeInCOMDAT(CGM, *D)) return true; // Declarations with a required alignment do not have common linkage in MSVC // mode. if (Context.getTargetInfo().getCXXABI().isMicrosoft()) { if (D->hasAttr()) return true; QualType VarType = D->getType(); if (Context.isAlignmentRequired(VarType)) return true; if (const auto *RT = VarType->getAs()) { const RecordDecl *RD = RT->getDecl(); for (const FieldDecl *FD : RD->fields()) { if (FD->isBitField()) continue; if (FD->hasAttr()) return true; if (Context.isAlignmentRequired(FD->getType())) return true; } } } // Microsoft's link.exe doesn't support alignments greater than 32 bytes for // common symbols, so symbols with greater alignment requirements cannot be // common. // Other COFF linkers (ld.bfd and LLD) support arbitrary power-of-two // alignments for common symbols via the aligncomm directive, so this // restriction only applies to MSVC environments. if (Context.getTargetInfo().getTriple().isKnownWindowsMSVCEnvironment() && Context.getTypeAlignIfKnown(D->getType()) > Context.toBits(CharUnits::fromQuantity(32))) return true; return false; } llvm::GlobalValue::LinkageTypes CodeGenModule::getLLVMLinkageForDeclarator(const DeclaratorDecl *D, GVALinkage Linkage) { if (Linkage == GVA_Internal) return llvm::Function::InternalLinkage; if (D->hasAttr()) return llvm::GlobalVariable::WeakAnyLinkage; if (const auto *FD = D->getAsFunction()) if (FD->isMultiVersion() && Linkage == GVA_AvailableExternally) return llvm::GlobalVariable::LinkOnceAnyLinkage; // We are guaranteed to have a strong definition somewhere else, // so we can use available_externally linkage. if (Linkage == GVA_AvailableExternally) return llvm::GlobalValue::AvailableExternallyLinkage; // Note that Apple's kernel linker doesn't support symbol // coalescing, so we need to avoid linkonce and weak linkages there. // Normally, this means we just map to internal, but for explicit // instantiations we'll map to external. // In C++, the compiler has to emit a definition in every translation unit // that references the function. We should use linkonce_odr because // a) if all references in this translation unit are optimized away, we // don't need to codegen it. b) if the function persists, it needs to be // merged with other definitions. c) C++ has the ODR, so we know the // definition is dependable. if (Linkage == GVA_DiscardableODR) return !Context.getLangOpts().AppleKext ? llvm::Function::LinkOnceODRLinkage : llvm::Function::InternalLinkage; // An explicit instantiation of a template has weak linkage, since // explicit instantiations can occur in multiple translation units // and must all be equivalent. However, we are not allowed to // throw away these explicit instantiations. // // CUDA/HIP: For -fno-gpu-rdc case, device code is limited to one TU, // so say that CUDA templates are either external (for kernels) or internal. // This lets llvm perform aggressive inter-procedural optimizations. 
For // -fgpu-rdc case, device function calls across multiple TU's are allowed, // therefore we need to follow the normal linkage paradigm. if (Linkage == GVA_StrongODR) { if (getLangOpts().AppleKext) return llvm::Function::ExternalLinkage; if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice && !getLangOpts().GPURelocatableDeviceCode) return D->hasAttr() ? llvm::Function::ExternalLinkage : llvm::Function::InternalLinkage; return llvm::Function::WeakODRLinkage; } // C++ doesn't have tentative definitions and thus cannot have common // linkage. if (!getLangOpts().CPlusPlus && isa(D) && !isVarDeclStrongDefinition(Context, *this, cast(D), CodeGenOpts.NoCommon)) return llvm::GlobalVariable::CommonLinkage; // selectany symbols are externally visible, so use weak instead of // linkonce. MSVC optimizes away references to const selectany globals, so // all definitions should be the same and ODR linkage should be used. // http://msdn.microsoft.com/en-us/library/5tkz6s71.aspx if (D->hasAttr()) return llvm::GlobalVariable::WeakODRLinkage; // Otherwise, we have strong external linkage. assert(Linkage == GVA_StrongExternal); return llvm::GlobalVariable::ExternalLinkage; } llvm::GlobalValue::LinkageTypes CodeGenModule::getLLVMLinkageVarDefinition(const VarDecl *VD) { GVALinkage Linkage = getContext().GetGVALinkageForVariable(VD); return getLLVMLinkageForDeclarator(VD, Linkage); } /// Replace the uses of a function that was declared with a non-proto type. /// We want to silently drop extra arguments from call sites static void replaceUsesOfNonProtoConstant(llvm::Constant *old, llvm::Function *newFn) { // Fast path. if (old->use_empty()) return; llvm::Type *newRetTy = newFn->getReturnType(); SmallVector newArgs; for (llvm::Value::use_iterator ui = old->use_begin(), ue = old->use_end(); ui != ue; ) { llvm::Value::use_iterator use = ui++; // Increment before the use is erased. llvm::User *user = use->getUser(); // Recognize and replace uses of bitcasts. Most calls to // unprototyped functions will use bitcasts. if (auto *bitcast = dyn_cast(user)) { if (bitcast->getOpcode() == llvm::Instruction::BitCast) replaceUsesOfNonProtoConstant(bitcast, newFn); continue; } // Recognize calls to the function. llvm::CallBase *callSite = dyn_cast(user); if (!callSite) continue; if (!callSite->isCallee(&*use)) continue; // If the return types don't match exactly, then we can't // transform this call unless it's dead. if (callSite->getType() != newRetTy && !callSite->use_empty()) continue; // Get the call site's attribute list. SmallVector newArgAttrs; llvm::AttributeList oldAttrs = callSite->getAttributes(); // If the function was passed too few arguments, don't transform. unsigned newNumArgs = newFn->arg_size(); if (callSite->arg_size() < newNumArgs) continue; // If extra arguments were passed, we silently drop them. // If any of the types mismatch, we don't transform. unsigned argNo = 0; bool dontTransform = false; for (llvm::Argument &A : newFn->args()) { if (callSite->getArgOperand(argNo)->getType() != A.getType()) { dontTransform = true; break; } // Add any parameter attributes. newArgAttrs.push_back(oldAttrs.getParamAttrs(argNo)); argNo++; } if (dontTransform) continue; // Okay, we can transform this. Create the new call instruction and copy // over the required information. newArgs.append(callSite->arg_begin(), callSite->arg_begin() + argNo); // Copy over any operand bundles. 
SmallVector newBundles; callSite->getOperandBundlesAsDefs(newBundles); llvm::CallBase *newCall; if (isa(callSite)) { newCall = llvm::CallInst::Create(newFn, newArgs, newBundles, "", callSite); } else { auto *oldInvoke = cast(callSite); newCall = llvm::InvokeInst::Create(newFn, oldInvoke->getNormalDest(), oldInvoke->getUnwindDest(), newArgs, newBundles, "", callSite); } newArgs.clear(); // for the next iteration if (!newCall->getType()->isVoidTy()) newCall->takeName(callSite); newCall->setAttributes( llvm::AttributeList::get(newFn->getContext(), oldAttrs.getFnAttrs(), oldAttrs.getRetAttrs(), newArgAttrs)); newCall->setCallingConv(callSite->getCallingConv()); // Finally, remove the old call, replacing any uses with the new one. if (!callSite->use_empty()) callSite->replaceAllUsesWith(newCall); // Copy debug location attached to CI. if (callSite->getDebugLoc()) newCall->setDebugLoc(callSite->getDebugLoc()); callSite->eraseFromParent(); } } /// ReplaceUsesOfNonProtoTypeWithRealFunction - This function is called when we /// implement a function with no prototype, e.g. "int foo() {}". If there are /// existing call uses of the old function in the module, this adjusts them to /// call the new function directly. /// /// This is not just a cleanup: the always_inline pass requires direct calls to /// functions to be able to inline them. If there is a bitcast in the way, it /// won't inline them. Instcombine normally deletes these calls, but it isn't /// run at -O0. static void ReplaceUsesOfNonProtoTypeWithRealFunction(llvm::GlobalValue *Old, llvm::Function *NewFn) { // If we're redefining a global as a function, don't transform it. if (!isa(Old)) return; replaceUsesOfNonProtoConstant(Old, NewFn); } void CodeGenModule::HandleCXXStaticMemberVarInstantiation(VarDecl *VD) { auto DK = VD->isThisDeclarationADefinition(); if (DK == VarDecl::Definition && VD->hasAttr()) return; TemplateSpecializationKind TSK = VD->getTemplateSpecializationKind(); // If we have a definition, this might be a deferred decl. If the // instantiation is explicit, make sure we emit it at the end. if (VD->getDefinition() && TSK == TSK_ExplicitInstantiationDefinition) GetAddrOfGlobalVar(VD); EmitTopLevelDecl(VD); } void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD, llvm::GlobalValue *GV) { const auto *D = cast(GD.getDecl()); // Compute the function info and LLVM type. const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD); llvm::FunctionType *Ty = getTypes().GetFunctionType(FI); // Get or create the prototype for the function. if (!GV || (GV->getValueType() != Ty)) GV = cast(GetAddrOfFunction(GD, Ty, /*ForVTable=*/false, /*DontDefer=*/true, ForDefinition)); // Already emitted. if (!GV->isDeclaration()) return; // We need to set linkage and visibility on the function before // generating code for it because various parts of IR generation // want to propagate this information down (e.g. to local static // declarations). 
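// For example (illustrative): for
//   inline int next() { static int n = 0; return ++n; }
// the local static 'n' (and its guard) derive their linkage and visibility
// from the enclosing function, so those must be set before the body is
// generated.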
auto *Fn = cast(GV); setFunctionLinkage(GD, Fn); // FIXME: this is redundant with part of setFunctionDefinitionAttributes setGVProperties(Fn, GD); MaybeHandleStaticInExternC(D, Fn); maybeSetTrivialComdat(*D, *Fn); CodeGenFunction(*this).GenerateCode(GD, Fn, FI); setNonAliasAttributes(GD, Fn); SetLLVMFunctionAttributesForDefinition(D, Fn); if (const ConstructorAttr *CA = D->getAttr()) AddGlobalCtor(Fn, CA->getPriority()); if (const DestructorAttr *DA = D->getAttr()) AddGlobalDtor(Fn, DA->getPriority(), true); if (D->hasAttr()) AddGlobalAnnotations(D, Fn); } void CodeGenModule::EmitAliasDefinition(GlobalDecl GD) { const auto *D = cast(GD.getDecl()); const AliasAttr *AA = D->getAttr(); assert(AA && "Not an alias?"); StringRef MangledName = getMangledName(GD); if (AA->getAliasee() == MangledName) { Diags.Report(AA->getLocation(), diag::err_cyclic_alias) << 0; return; } // If there is a definition in the module, then it wins over the alias. // This is dubious, but allow it to be safe. Just ignore the alias. llvm::GlobalValue *Entry = GetGlobalValue(MangledName); if (Entry && !Entry->isDeclaration()) return; Aliases.push_back(GD); llvm::Type *DeclTy = getTypes().ConvertTypeForMem(D->getType()); // Create a reference to the named value. This ensures that it is emitted // if a deferred decl. llvm::Constant *Aliasee; llvm::GlobalValue::LinkageTypes LT; if (isa(DeclTy)) { Aliasee = GetOrCreateLLVMFunction(AA->getAliasee(), DeclTy, GD, /*ForVTable=*/false); LT = getFunctionLinkage(GD); } else { Aliasee = GetOrCreateLLVMGlobal(AA->getAliasee(), DeclTy, LangAS::Default, /*D=*/nullptr); if (const auto *VD = dyn_cast(GD.getDecl())) LT = getLLVMLinkageVarDefinition(VD); else LT = getFunctionLinkage(GD); } // Create the new alias itself, but don't set a name yet. unsigned AS = Aliasee->getType()->getPointerAddressSpace(); auto *GA = llvm::GlobalAlias::create(DeclTy, AS, LT, "", Aliasee, &getModule()); if (Entry) { if (GA->getAliasee() == Entry) { Diags.Report(AA->getLocation(), diag::err_cyclic_alias) << 0; return; } assert(Entry->isDeclaration()); // If there is a declaration in the module, then we had an extern followed // by the alias, as in: // extern int test6(); // ... // int test6() __attribute__((alias("test7"))); // // Remove it and replace uses of it with the alias. GA->takeName(Entry); Entry->replaceAllUsesWith(llvm::ConstantExpr::getBitCast(GA, Entry->getType())); Entry->eraseFromParent(); } else { GA->setName(MangledName); } // Set attributes which are particular to an alias; this is a // specialization of the attributes which may be set on a global // variable/function. if (D->hasAttr() || D->hasAttr() || D->isWeakImported()) { GA->setLinkage(llvm::Function::WeakAnyLinkage); } if (const auto *VD = dyn_cast(D)) if (VD->getTLSKind()) setTLSMode(GA, *VD); SetCommonAttributes(GD, GA); // Emit global alias debug information. if (isa(D)) if (CGDebugInfo *DI = getModuleDebugInfo()) DI->EmitGlobalAlias(cast(GA->getAliasee()->stripPointerCasts()), GD); } void CodeGenModule::emitIFuncDefinition(GlobalDecl GD) { const auto *D = cast(GD.getDecl()); const IFuncAttr *IFA = D->getAttr(); assert(IFA && "Not an ifunc?"); StringRef MangledName = getMangledName(GD); if (IFA->getResolver() == MangledName) { Diags.Report(IFA->getLocation(), diag::err_cyclic_alias) << 1; return; } // Report an error if some definition overrides ifunc. 
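// For example (illustrative):
//   int foo() __attribute__((ifunc("foo_resolver")));
//   int foo() { return 0; }   // conflicts with the ifunc; diagnosed below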
llvm::GlobalValue *Entry = GetGlobalValue(MangledName); if (Entry && !Entry->isDeclaration()) { GlobalDecl OtherGD; if (lookupRepresentativeDecl(MangledName, OtherGD) && DiagnosedConflictingDefinitions.insert(GD).second) { Diags.Report(D->getLocation(), diag::err_duplicate_mangled_name) << MangledName; Diags.Report(OtherGD.getDecl()->getLocation(), diag::note_previous_definition); } return; } Aliases.push_back(GD); llvm::Type *DeclTy = getTypes().ConvertTypeForMem(D->getType()); llvm::Type *ResolverTy = llvm::GlobalIFunc::getResolverFunctionType(DeclTy); llvm::Constant *Resolver = GetOrCreateLLVMFunction(IFA->getResolver(), ResolverTy, {}, /*ForVTable=*/false); llvm::GlobalIFunc *GIF = llvm::GlobalIFunc::create(DeclTy, 0, llvm::Function::ExternalLinkage, "", Resolver, &getModule()); if (Entry) { if (GIF->getResolver() == Entry) { Diags.Report(IFA->getLocation(), diag::err_cyclic_alias) << 1; return; } assert(Entry->isDeclaration()); // If there is a declaration in the module, then we had an extern followed // by the ifunc, as in: // extern int test(); // ... // int test() __attribute__((ifunc("resolver"))); // // Remove it and replace uses of it with the ifunc. GIF->takeName(Entry); Entry->replaceAllUsesWith(llvm::ConstantExpr::getBitCast(GIF, Entry->getType())); Entry->eraseFromParent(); } else GIF->setName(MangledName); SetCommonAttributes(GD, GIF); } llvm::Function *CodeGenModule::getIntrinsic(unsigned IID, ArrayRef Tys) { return llvm::Intrinsic::getDeclaration(&getModule(), (llvm::Intrinsic::ID)IID, Tys); } static llvm::StringMapEntry & GetConstantCFStringEntry(llvm::StringMap &Map, const StringLiteral *Literal, bool TargetIsLSB, bool &IsUTF16, unsigned &StringLength) { StringRef String = Literal->getString(); unsigned NumBytes = String.size(); // Check for simple case. if (!Literal->containsNonAsciiOrNull()) { StringLength = NumBytes; return *Map.insert(std::make_pair(String, nullptr)).first; } // Otherwise, convert the UTF8 literals into a string of shorts. IsUTF16 = true; SmallVector ToBuf(NumBytes + 1); // +1 for ending nulls. const llvm::UTF8 *FromPtr = (const llvm::UTF8 *)String.data(); llvm::UTF16 *ToPtr = &ToBuf[0]; (void)llvm::ConvertUTF8toUTF16(&FromPtr, FromPtr + NumBytes, &ToPtr, ToPtr + NumBytes, llvm::strictConversion); // ConvertUTF8toUTF16 returns the length in ToPtr. StringLength = ToPtr - &ToBuf[0]; // Add an explicit null. *ToPtr = 0; return *Map.insert(std::make_pair( StringRef(reinterpret_cast(ToBuf.data()), (StringLength + 1) * 2), nullptr)).first; } ConstantAddress CodeGenModule::GetAddrOfConstantCFString(const StringLiteral *Literal) { unsigned StringLength = 0; bool isUTF16 = false; llvm::StringMapEntry &Entry = GetConstantCFStringEntry(CFConstantStringMap, Literal, getDataLayout().isLittleEndian(), isUTF16, StringLength); if (auto *C = Entry.second) return ConstantAddress( C, C->getValueType(), CharUnits::fromQuantity(C->getAlignment())); llvm::Constant *Zero = llvm::Constant::getNullValue(Int32Ty); llvm::Constant *Zeros[] = { Zero, Zero }; const ASTContext &Context = getContext(); const llvm::Triple &Triple = getTriple(); const auto CFRuntime = getLangOpts().CFRuntime; const bool IsSwiftABI = static_cast(CFRuntime) >= static_cast(LangOptions::CoreFoundationABI::Swift); const bool IsSwift4_1 = CFRuntime == LangOptions::CoreFoundationABI::Swift4_1; // If we don't already have it, get __CFConstantStringClassReference. 
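// For example (illustrative): the first CFSTR("...") literal emitted in a
// TU creates this reference; subsequent literals reuse the cached
// CFConstantStringClassRef.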
if (!CFConstantStringClassRef) { const char *CFConstantStringClassName = "__CFConstantStringClassReference"; llvm::Type *Ty = getTypes().ConvertType(getContext().IntTy); Ty = llvm::ArrayType::get(Ty, 0); switch (CFRuntime) { default: break; case LangOptions::CoreFoundationABI::Swift: [[fallthrough]]; case LangOptions::CoreFoundationABI::Swift5_0: CFConstantStringClassName = Triple.isOSDarwin() ? "$s15SwiftFoundation19_NSCFConstantStringCN" : "$s10Foundation19_NSCFConstantStringCN"; Ty = IntPtrTy; break; case LangOptions::CoreFoundationABI::Swift4_2: CFConstantStringClassName = Triple.isOSDarwin() ? "$S15SwiftFoundation19_NSCFConstantStringCN" : "$S10Foundation19_NSCFConstantStringCN"; Ty = IntPtrTy; break; case LangOptions::CoreFoundationABI::Swift4_1: CFConstantStringClassName = Triple.isOSDarwin() ? "__T015SwiftFoundation19_NSCFConstantStringCN" : "__T010Foundation19_NSCFConstantStringCN"; Ty = IntPtrTy; break; } llvm::Constant *C = CreateRuntimeVariable(Ty, CFConstantStringClassName); if (Triple.isOSBinFormatELF() || Triple.isOSBinFormatCOFF()) { llvm::GlobalValue *GV = nullptr; if ((GV = dyn_cast(C))) { IdentifierInfo &II = Context.Idents.get(GV->getName()); TranslationUnitDecl *TUDecl = Context.getTranslationUnitDecl(); DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); const VarDecl *VD = nullptr; for (const auto *Result : DC->lookup(&II)) if ((VD = dyn_cast(Result))) break; if (Triple.isOSBinFormatELF()) { if (!VD) GV->setLinkage(llvm::GlobalValue::ExternalLinkage); } else { GV->setLinkage(llvm::GlobalValue::ExternalLinkage); if (!VD || !VD->hasAttr()) GV->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass); else GV->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass); } setDSOLocal(GV); } } // Decay array -> ptr CFConstantStringClassRef = IsSwiftABI ? llvm::ConstantExpr::getPtrToInt(C, Ty) : llvm::ConstantExpr::getGetElementPtr(Ty, C, Zeros); } QualType CFTy = Context.getCFConstantStringType(); auto *STy = cast(getTypes().ConvertType(CFTy)); ConstantInitBuilder Builder(*this); auto Fields = Builder.beginStruct(STy); // Class pointer. Fields.add(cast(CFConstantStringClassRef)); // Flags. if (IsSwiftABI) { Fields.addInt(IntPtrTy, IsSwift4_1 ? 0x05 : 0x01); Fields.addInt(Int64Ty, isUTF16 ? 0x07d0 : 0x07c8); } else { Fields.addInt(IntTy, isUTF16 ? 0x07d0 : 0x07C8); } // String pointer. llvm::Constant *C = nullptr; if (isUTF16) { auto Arr = llvm::ArrayRef( reinterpret_cast(const_cast(Entry.first().data())), Entry.first().size() / 2); C = llvm::ConstantDataArray::get(VMContext, Arr); } else { C = llvm::ConstantDataArray::getString(VMContext, Entry.first()); } // Note: -fwritable-strings doesn't make the backing store strings of // CFStrings writable. (See ) auto *GV = new llvm::GlobalVariable(getModule(), C->getType(), /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, C, ".str"); GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); // Don't enforce the target's minimum global alignment, since the only use // of the string is via this class initializer. CharUnits Align = isUTF16 ? Context.getTypeAlignInChars(Context.ShortTy) : Context.getTypeAlignInChars(Context.CharTy); GV->setAlignment(Align.getAsAlign()); // FIXME: We set the section explicitly to avoid a bug in ld64 224.1. // Without it LLVM can merge the string with a non unnamed_addr one during // LTO. Doing that changes the section it ends in, which surprises ld64. if (Triple.isOSBinFormatMachO()) GV->setSection(isUTF16 ? 
"__TEXT,__ustring" : "__TEXT,__cstring,cstring_literals"); // Make sure the literal ends up in .rodata to allow for safe ICF and for // the static linker to adjust permissions to read-only later on. else if (Triple.isOSBinFormatELF()) GV->setSection(".rodata"); // String. llvm::Constant *Str = llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros); if (isUTF16) // Cast the UTF16 string to the correct type. Str = llvm::ConstantExpr::getBitCast(Str, Int8PtrTy); Fields.add(Str); // String length. llvm::IntegerType *LengthTy = llvm::IntegerType::get(getModule().getContext(), Context.getTargetInfo().getLongWidth()); if (IsSwiftABI) { if (CFRuntime == LangOptions::CoreFoundationABI::Swift4_1 || CFRuntime == LangOptions::CoreFoundationABI::Swift4_2) LengthTy = Int32Ty; else LengthTy = IntPtrTy; } Fields.addInt(LengthTy, StringLength); // Swift ABI requires 8-byte alignment to ensure that the _Atomic(uint64_t) is // properly aligned on 32-bit platforms. CharUnits Alignment = IsSwiftABI ? Context.toCharUnitsFromBits(64) : getPointerAlign(); // The struct. GV = Fields.finishAndCreateGlobal("_unnamed_cfstring_", Alignment, /*isConstant=*/false, llvm::GlobalVariable::PrivateLinkage); GV->addAttribute("objc_arc_inert"); switch (Triple.getObjectFormat()) { case llvm::Triple::UnknownObjectFormat: llvm_unreachable("unknown file format"); case llvm::Triple::DXContainer: case llvm::Triple::GOFF: case llvm::Triple::SPIRV: case llvm::Triple::XCOFF: llvm_unreachable("unimplemented"); case llvm::Triple::COFF: case llvm::Triple::ELF: case llvm::Triple::Wasm: GV->setSection("cfstring"); break; case llvm::Triple::MachO: GV->setSection("__DATA,__cfstring"); break; } Entry.second = GV; return ConstantAddress(GV, GV->getValueType(), Alignment); } bool CodeGenModule::getExpressionLocationsEnabled() const { return !CodeGenOpts.EmitCodeView || CodeGenOpts.DebugColumnInfo; } QualType CodeGenModule::getObjCFastEnumerationStateType() { if (ObjCFastEnumerationStateType.isNull()) { RecordDecl *D = Context.buildImplicitRecord("__objcFastEnumerationState"); D->startDefinition(); QualType FieldTypes[] = { Context.UnsignedLongTy, Context.getPointerType(Context.getObjCIdType()), Context.getPointerType(Context.UnsignedLongTy), Context.getConstantArrayType(Context.UnsignedLongTy, llvm::APInt(32, 5), nullptr, ArrayType::Normal, 0) }; for (size_t i = 0; i < 4; ++i) { FieldDecl *Field = FieldDecl::Create(Context, D, SourceLocation(), SourceLocation(), nullptr, FieldTypes[i], /*TInfo=*/nullptr, /*BitWidth=*/nullptr, /*Mutable=*/false, ICIS_NoInit); Field->setAccess(AS_public); D->addDecl(Field); } D->completeDefinition(); ObjCFastEnumerationStateType = Context.getTagDeclType(D); } return ObjCFastEnumerationStateType; } llvm::Constant * CodeGenModule::GetConstantArrayFromStringLiteral(const StringLiteral *E) { assert(!E->getType()->isPointerType() && "Strings are always arrays"); // Don't emit it as the address of the string, emit the string data itself // as an inline array. if (E->getCharByteWidth() == 1) { SmallString<64> Str(E->getString()); // Resize the string to the right size, which is indicated by its type. 
const ConstantArrayType *CAT = Context.getAsConstantArrayType(E->getType()); assert(CAT && "String literal not of constant array type!"); Str.resize(CAT->getSize().getZExtValue()); return llvm::ConstantDataArray::getString(VMContext, Str, false); } auto *AType = cast(getTypes().ConvertType(E->getType())); llvm::Type *ElemTy = AType->getElementType(); unsigned NumElements = AType->getNumElements(); // Wide strings have either 2-byte or 4-byte elements. if (ElemTy->getPrimitiveSizeInBits() == 16) { SmallVector Elements; Elements.reserve(NumElements); for(unsigned i = 0, e = E->getLength(); i != e; ++i) Elements.push_back(E->getCodeUnit(i)); Elements.resize(NumElements); return llvm::ConstantDataArray::get(VMContext, Elements); } assert(ElemTy->getPrimitiveSizeInBits() == 32); SmallVector Elements; Elements.reserve(NumElements); for(unsigned i = 0, e = E->getLength(); i != e; ++i) Elements.push_back(E->getCodeUnit(i)); Elements.resize(NumElements); return llvm::ConstantDataArray::get(VMContext, Elements); } static llvm::GlobalVariable * GenerateStringLiteral(llvm::Constant *C, llvm::GlobalValue::LinkageTypes LT, CodeGenModule &CGM, StringRef GlobalName, CharUnits Alignment) { unsigned AddrSpace = CGM.getContext().getTargetAddressSpace( CGM.GetGlobalConstantAddressSpace()); llvm::Module &M = CGM.getModule(); // Create a global variable for this string auto *GV = new llvm::GlobalVariable( M, C->getType(), !CGM.getLangOpts().WritableStrings, LT, C, GlobalName, nullptr, llvm::GlobalVariable::NotThreadLocal, AddrSpace); GV->setAlignment(Alignment.getAsAlign()); GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); if (GV->isWeakForLinker()) { assert(CGM.supportsCOMDAT() && "Only COFF uses weak string literals"); GV->setComdat(M.getOrInsertComdat(GV->getName())); } CGM.setDSOLocal(GV); return GV; } /// GetAddrOfConstantStringFromLiteral - Return a pointer to a /// constant array for the given string literal. ConstantAddress CodeGenModule::GetAddrOfConstantStringFromLiteral(const StringLiteral *S, StringRef Name) { CharUnits Alignment = getContext().getAlignOfGlobalVarInChars(S->getType()); llvm::Constant *C = GetConstantArrayFromStringLiteral(S); llvm::GlobalVariable **Entry = nullptr; if (!LangOpts.WritableStrings) { Entry = &ConstantStringMap[C]; if (auto GV = *Entry) { if (uint64_t(Alignment.getQuantity()) > GV->getAlignment()) GV->setAlignment(Alignment.getAsAlign()); return ConstantAddress(castStringLiteralToDefaultAddressSpace(*this, GV), GV->getValueType(), Alignment); } } SmallString<256> MangledNameBuffer; StringRef GlobalVariableName; llvm::GlobalValue::LinkageTypes LT; // Mangle the string literal if that's how the ABI merges duplicate strings. // Don't do it if they are writable, since we don't want writes in one TU to // affect strings in another. 
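// For example (illustrative): under the Microsoft C++ ABI identical
// literals from different TUs share one linkonce_odr global with a mangled
// name, while the Itanium ABI typically keeps them as private unnamed
// globals.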
if (getCXXABI().getMangleContext().shouldMangleStringLiteral(S) && !LangOpts.WritableStrings) { llvm::raw_svector_ostream Out(MangledNameBuffer); getCXXABI().getMangleContext().mangleStringLiteral(S, Out); LT = llvm::GlobalValue::LinkOnceODRLinkage; GlobalVariableName = MangledNameBuffer; } else { LT = llvm::GlobalValue::PrivateLinkage; GlobalVariableName = Name; } auto GV = GenerateStringLiteral(C, LT, *this, GlobalVariableName, Alignment); CGDebugInfo *DI = getModuleDebugInfo(); if (DI && getCodeGenOpts().hasReducedDebugInfo()) DI->AddStringLiteralDebugInfo(GV, S); if (Entry) *Entry = GV; SanitizerMD->reportGlobal(GV, S->getStrTokenLoc(0), ""); return ConstantAddress(castStringLiteralToDefaultAddressSpace(*this, GV), GV->getValueType(), Alignment); } /// GetAddrOfConstantStringFromObjCEncode - Return a pointer to a constant /// array for the given ObjCEncodeExpr node. ConstantAddress CodeGenModule::GetAddrOfConstantStringFromObjCEncode(const ObjCEncodeExpr *E) { std::string Str; getContext().getObjCEncodingForType(E->getEncodedType(), Str); return GetAddrOfConstantCString(Str); } /// GetAddrOfConstantCString - Returns a pointer to a character array containing /// the literal and a terminating '\0' character. /// The result has pointer to array type. ConstantAddress CodeGenModule::GetAddrOfConstantCString( const std::string &Str, const char *GlobalName) { StringRef StrWithNull(Str.c_str(), Str.size() + 1); CharUnits Alignment = getContext().getAlignOfGlobalVarInChars(getContext().CharTy); llvm::Constant *C = llvm::ConstantDataArray::getString(getLLVMContext(), StrWithNull, false); // Don't share any string literals if strings aren't constant. llvm::GlobalVariable **Entry = nullptr; if (!LangOpts.WritableStrings) { Entry = &ConstantStringMap[C]; if (auto GV = *Entry) { if (uint64_t(Alignment.getQuantity()) > GV->getAlignment()) GV->setAlignment(Alignment.getAsAlign()); return ConstantAddress(castStringLiteralToDefaultAddressSpace(*this, GV), GV->getValueType(), Alignment); } } // Get the default prefix if a name wasn't specified. if (!GlobalName) GlobalName = ".str"; // Create a global variable for this. auto GV = GenerateStringLiteral(C, llvm::GlobalValue::PrivateLinkage, *this, GlobalName, Alignment); if (Entry) *Entry = GV; return ConstantAddress(castStringLiteralToDefaultAddressSpace(*this, GV), GV->getValueType(), Alignment); } ConstantAddress CodeGenModule::GetAddrOfGlobalTemporary( const MaterializeTemporaryExpr *E, const Expr *Init) { assert((E->getStorageDuration() == SD_Static || E->getStorageDuration() == SD_Thread) && "not a global temporary"); const auto *VD = cast(E->getExtendingDecl()); // If we're not materializing a subobject of the temporary, keep the // cv-qualifiers from the type of the MaterializeTemporaryExpr. QualType MaterializedType = Init->getType(); if (Init == E->getSubExpr()) MaterializedType = E->getType(); CharUnits Align = getContext().getTypeAlignInChars(MaterializedType); auto InsertResult = MaterializedGlobalTemporaryMap.insert({E, nullptr}); if (!InsertResult.second) { // We've seen this before: either we already created it or we're in the // process of doing so. if (!InsertResult.first->second) { // We recursively re-entered this function, probably during emission of // the initializer. Create a placeholder. We'll clean this up in the // outer call, at the end of this function. 
llvm::Type *Type = getTypes().ConvertTypeForMem(MaterializedType); InsertResult.first->second = new llvm::GlobalVariable( getModule(), Type, false, llvm::GlobalVariable::InternalLinkage, nullptr); } return ConstantAddress(InsertResult.first->second, llvm::cast( InsertResult.first->second->stripPointerCasts()) ->getValueType(), Align); } // FIXME: If an externally-visible declaration extends multiple temporaries, // we need to give each temporary the same name in every translation unit (and // we also need to make the temporaries externally-visible). SmallString<256> Name; llvm::raw_svector_ostream Out(Name); getCXXABI().getMangleContext().mangleReferenceTemporary( VD, E->getManglingNumber(), Out); APValue *Value = nullptr; if (E->getStorageDuration() == SD_Static && VD && VD->evaluateValue()) { // If the initializer of the extending declaration is a constant // initializer, we should have a cached constant initializer for this // temporary. Note that this might have a different value from the value // computed by evaluating the initializer if the surrounding constant // expression modifies the temporary. Value = E->getOrCreateValue(false); } // Try evaluating it now, it might have a constant initializer. Expr::EvalResult EvalResult; if (!Value && Init->EvaluateAsRValue(EvalResult, getContext()) && !EvalResult.hasSideEffects()) Value = &EvalResult.Val; LangAS AddrSpace = VD ? GetGlobalVarAddressSpace(VD) : MaterializedType.getAddressSpace(); std::optional emitter; llvm::Constant *InitialValue = nullptr; bool Constant = false; llvm::Type *Type; if (Value) { // The temporary has a constant initializer, use it. emitter.emplace(*this); InitialValue = emitter->emitForInitializer(*Value, AddrSpace, MaterializedType); Constant = isTypeConstant(MaterializedType, /*ExcludeCtor*/ Value, /*ExcludeDtor*/ false); Type = InitialValue->getType(); } else { // No initializer, the initialization will be provided when we // initialize the declaration which performed lifetime extension. Type = getTypes().ConvertTypeForMem(MaterializedType); } // Create a global variable for this lifetime-extended temporary. llvm::GlobalValue::LinkageTypes Linkage = getLLVMLinkageVarDefinition(VD); if (Linkage == llvm::GlobalVariable::ExternalLinkage) { const VarDecl *InitVD; if (VD->isStaticDataMember() && VD->getAnyInitializer(InitVD) && isa(InitVD->getLexicalDeclContext())) { // Temporaries defined inside a class get linkonce_odr linkage because the // class can be defined in multiple translation units. Linkage = llvm::GlobalVariable::LinkOnceODRLinkage; } else { // There is no need for this temporary to have external linkage if the // VarDecl has external linkage. Linkage = llvm::GlobalVariable::InternalLinkage; } } auto TargetAS = getContext().getTargetAddressSpace(AddrSpace); auto *GV = new llvm::GlobalVariable( getModule(), Type, Constant, Linkage, InitialValue, Name.c_str(), /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, TargetAS); if (emitter) emitter->finalize(GV); // Don't assign dllimport or dllexport to local linkage globals. if (!llvm::GlobalValue::isLocalLinkage(Linkage)) { setGVProperties(GV, VD); if (GV->getDLLStorageClass() == llvm::GlobalVariable::DLLExportStorageClass) // The reference temporary should never be dllexport. 
GV->setDLLStorageClass(llvm::GlobalVariable::DefaultStorageClass); } GV->setAlignment(Align.getAsAlign()); if (supportsCOMDAT() && GV->isWeakForLinker()) GV->setComdat(TheModule.getOrInsertComdat(GV->getName())); if (VD->getTLSKind()) setTLSMode(GV, *VD); llvm::Constant *CV = GV; if (AddrSpace != LangAS::Default) CV = getTargetCodeGenInfo().performAddrSpaceCast( *this, GV, AddrSpace, LangAS::Default, Type->getPointerTo( getContext().getTargetAddressSpace(LangAS::Default))); // Update the map with the new temporary. If we created a placeholder above, // replace it with the new global now. llvm::Constant *&Entry = MaterializedGlobalTemporaryMap[E]; if (Entry) { Entry->replaceAllUsesWith( llvm::ConstantExpr::getBitCast(CV, Entry->getType())); llvm::cast(Entry)->eraseFromParent(); } Entry = CV; return ConstantAddress(CV, Type, Align); } /// EmitObjCPropertyImplementations - Emit information for synthesized /// properties for an implementation. void CodeGenModule::EmitObjCPropertyImplementations(const ObjCImplementationDecl *D) { for (const auto *PID : D->property_impls()) { // Dynamic is just for type-checking. if (PID->getPropertyImplementation() == ObjCPropertyImplDecl::Synthesize) { ObjCPropertyDecl *PD = PID->getPropertyDecl(); // Determine which methods need to be implemented, some may have // been overridden. Note that ::isPropertyAccessor is not the method // we want, that just indicates if the decl came from a // property. What we want to know is if the method is defined in // this implementation. auto *Getter = PID->getGetterMethodDecl(); if (!Getter || Getter->isSynthesizedAccessorStub()) CodeGenFunction(*this).GenerateObjCGetter( const_cast(D), PID); auto *Setter = PID->getSetterMethodDecl(); if (!PD->isReadOnly() && (!Setter || Setter->isSynthesizedAccessorStub())) CodeGenFunction(*this).GenerateObjCSetter( const_cast(D), PID); } } } static bool needsDestructMethod(ObjCImplementationDecl *impl) { const ObjCInterfaceDecl *iface = impl->getClassInterface(); for (const ObjCIvarDecl *ivar = iface->all_declared_ivar_begin(); ivar; ivar = ivar->getNextIvar()) if (ivar->getType().isDestructedType()) return true; return false; } static bool AllTrivialInitializers(CodeGenModule &CGM, ObjCImplementationDecl *D) { CodeGenFunction CGF(CGM); for (ObjCImplementationDecl::init_iterator B = D->init_begin(), E = D->init_end(); B != E; ++B) { CXXCtorInitializer *CtorInitExp = *B; Expr *Init = CtorInitExp->getInit(); if (!CGF.isTrivialInitializer(Init)) return false; } return true; } /// EmitObjCIvarInitializations - Emit information for ivar initialization /// for an implementation. void CodeGenModule::EmitObjCIvarInitializations(ObjCImplementationDecl *D) { // We might need a .cxx_destruct even if we don't have any ivar initializers. if (needsDestructMethod(D)) { IdentifierInfo *II = &getContext().Idents.get(".cxx_destruct"); Selector cxxSelector = getContext().Selectors.getSelector(0, &II); ObjCMethodDecl *DTORMethod = ObjCMethodDecl::Create( getContext(), D->getLocation(), D->getLocation(), cxxSelector, getContext().VoidTy, nullptr, D, /*isInstance=*/true, /*isVariadic=*/false, /*isPropertyAccessor=*/true, /*isSynthesizedAccessorStub=*/false, /*isImplicitlyDeclared=*/true, /*isDefined=*/false, ObjCMethodDecl::Required); D->addInstanceMethod(DTORMethod); CodeGenFunction(*this).GenerateObjCCtorDtorMethod(D, DTORMethod, false); D->setHasDestructors(true); } // If the implementation doesn't have any ivar initializers, we don't need // a .cxx_construct. 
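// For example (illustrative, ObjC++): an implementation whose interface
// declares an ivar
//   std::string Name;
// gets both .cxx_construct and .cxx_destruct, while one with only
// trivially-initialized ivars skips .cxx_construct here.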
if (D->getNumIvarInitializers() == 0 || AllTrivialInitializers(*this, D)) return; IdentifierInfo *II = &getContext().Idents.get(".cxx_construct"); Selector cxxSelector = getContext().Selectors.getSelector(0, &II); // The constructor returns 'self'. ObjCMethodDecl *CTORMethod = ObjCMethodDecl::Create( getContext(), D->getLocation(), D->getLocation(), cxxSelector, getContext().getObjCIdType(), nullptr, D, /*isInstance=*/true, /*isVariadic=*/false, /*isPropertyAccessor=*/true, /*isSynthesizedAccessorStub=*/false, /*isImplicitlyDeclared=*/true, /*isDefined=*/false, ObjCMethodDecl::Required); D->addInstanceMethod(CTORMethod); CodeGenFunction(*this).GenerateObjCCtorDtorMethod(D, CTORMethod, true); D->setHasNonZeroConstructors(true); } // EmitLinkageSpec - Emit all declarations in a linkage spec. void CodeGenModule::EmitLinkageSpec(const LinkageSpecDecl *LSD) { if (LSD->getLanguage() != LinkageSpecDecl::lang_c && LSD->getLanguage() != LinkageSpecDecl::lang_cxx) { ErrorUnsupported(LSD, "linkage spec"); return; } EmitDeclContext(LSD); } void CodeGenModule::EmitTopLevelStmt(const TopLevelStmtDecl *D) { // Device code should not be at top level. if (LangOpts.CUDA && LangOpts.CUDAIsDevice) return; std::unique_ptr &CurCGF = GlobalTopLevelStmtBlockInFlight.first; // We emitted a top-level stmt but after it there is initialization. // Stop squashing the top-level stmts into a single function. if (CurCGF && CXXGlobalInits.back() != CurCGF->CurFn) { CurCGF->FinishFunction(D->getEndLoc()); CurCGF = nullptr; } if (!CurCGF) { // void __stmts__N(void) // FIXME: Ask the ABI name mangler to pick a name. std::string Name = "__stmts__" + llvm::utostr(CXXGlobalInits.size()); FunctionArgList Args; QualType RetTy = getContext().VoidTy; const CGFunctionInfo &FnInfo = getTypes().arrangeBuiltinFunctionDeclaration(RetTy, Args); llvm::FunctionType *FnTy = getTypes().GetFunctionType(FnInfo); llvm::Function *Fn = llvm::Function::Create( FnTy, llvm::GlobalValue::InternalLinkage, Name, &getModule()); CurCGF.reset(new CodeGenFunction(*this)); GlobalTopLevelStmtBlockInFlight.second = D; CurCGF->StartFunction(GlobalDecl(), RetTy, Fn, FnInfo, Args, D->getBeginLoc(), D->getBeginLoc()); CXXGlobalInits.push_back(Fn); } CurCGF->EmitStmt(D->getStmt()); } void CodeGenModule::EmitDeclContext(const DeclContext *DC) { for (auto *I : DC->decls()) { // Unlike other DeclContexts, the contents of an ObjCImplDecl at TU scope // are themselves considered "top-level", so EmitTopLevelDecl on an // ObjCImplDecl does not recursively visit them. We need to do that in // case they're nested inside another construct (LinkageSpecDecl / // ExportDecl) that does stop them from being considered "top-level". if (auto *OID = dyn_cast(I)) { for (auto *M : OID->methods()) EmitTopLevelDecl(M); } EmitTopLevelDecl(I); } } /// EmitTopLevelDecl - Emit code for a single top level declaration. void CodeGenModule::EmitTopLevelDecl(Decl *D) { // Ignore dependent declarations. if (D->isTemplated()) return; // Consteval function shouldn't be emitted. if (auto *FD = dyn_cast(D); FD && FD->isImmediateFunction()) return; switch (D->getKind()) { case Decl::CXXConversion: case Decl::CXXMethod: case Decl::Function: EmitGlobal(cast(D)); // Always provide some coverage mapping // even for the functions that aren't emitted. AddDeferredUnusedCoverageMapping(D); break; case Decl::CXXDeductionGuide: // Function-like, but does not result in code emission. 
break; case Decl::Var: case Decl::Decomposition: case Decl::VarTemplateSpecialization: EmitGlobal(cast(D)); if (auto *DD = dyn_cast(D)) for (auto *B : DD->bindings()) if (auto *HD = B->getHoldingVar()) EmitGlobal(HD); break; // Indirect fields from global anonymous structs and unions can be // ignored; only the actual variable requires IR gen support. case Decl::IndirectField: break; // C++ Decls case Decl::Namespace: EmitDeclContext(cast(D)); break; case Decl::ClassTemplateSpecialization: { const auto *Spec = cast(D); if (CGDebugInfo *DI = getModuleDebugInfo()) if (Spec->getSpecializationKind() == TSK_ExplicitInstantiationDefinition && Spec->hasDefinition()) DI->completeTemplateDefinition(*Spec); } [[fallthrough]]; case Decl::CXXRecord: { CXXRecordDecl *CRD = cast(D); if (CGDebugInfo *DI = getModuleDebugInfo()) { if (CRD->hasDefinition()) DI->EmitAndRetainType(getContext().getRecordType(cast(D))); if (auto *ES = D->getASTContext().getExternalSource()) if (ES->hasExternalDefinitions(D) == ExternalASTSource::EK_Never) DI->completeUnusedClass(*CRD); } // Emit any static data members, they may be definitions. for (auto *I : CRD->decls()) if (isa(I) || isa(I)) EmitTopLevelDecl(I); break; } // No code generation needed. case Decl::UsingShadow: case Decl::ClassTemplate: case Decl::VarTemplate: case Decl::Concept: case Decl::VarTemplatePartialSpecialization: case Decl::FunctionTemplate: case Decl::TypeAliasTemplate: case Decl::Block: case Decl::Empty: case Decl::Binding: break; case Decl::Using: // using X; [C++] if (CGDebugInfo *DI = getModuleDebugInfo()) DI->EmitUsingDecl(cast(*D)); break; case Decl::UsingEnum: // using enum X; [C++] if (CGDebugInfo *DI = getModuleDebugInfo()) DI->EmitUsingEnumDecl(cast(*D)); break; case Decl::NamespaceAlias: if (CGDebugInfo *DI = getModuleDebugInfo()) DI->EmitNamespaceAlias(cast(*D)); break; case Decl::UsingDirective: // using namespace X; [C++] if (CGDebugInfo *DI = getModuleDebugInfo()) DI->EmitUsingDirective(cast(*D)); break; case Decl::CXXConstructor: getCXXABI().EmitCXXConstructors(cast(D)); break; case Decl::CXXDestructor: getCXXABI().EmitCXXDestructors(cast(D)); break; case Decl::StaticAssert: // Nothing to do. break; // Objective-C Decls // Forward declarations, no (immediate) code generation. case Decl::ObjCInterface: case Decl::ObjCCategory: break; case Decl::ObjCProtocol: { auto *Proto = cast(D); if (Proto->isThisDeclarationADefinition()) ObjCRuntime->GenerateProtocol(Proto); break; } case Decl::ObjCCategoryImpl: // Categories have properties but don't support synthesize so we // can ignore them here. ObjCRuntime->GenerateCategory(cast(D)); break; case Decl::ObjCImplementation: { auto *OMD = cast(D); EmitObjCPropertyImplementations(OMD); EmitObjCIvarInitializations(OMD); ObjCRuntime->GenerateClass(OMD); // Emit global variable debug information. if (CGDebugInfo *DI = getModuleDebugInfo()) if (getCodeGenOpts().hasReducedDebugInfo()) DI->getOrCreateInterfaceType(getContext().getObjCInterfaceType( OMD->getClassInterface()), OMD->getLocation()); break; } case Decl::ObjCMethod: { auto *OMD = cast(D); // If this is not a prototype, emit the body. 
if (OMD->getBody()) CodeGenFunction(*this).GenerateObjCMethod(OMD); break; } case Decl::ObjCCompatibleAlias: ObjCRuntime->RegisterAlias(cast(D)); break; case Decl::PragmaComment: { const auto *PCD = cast(D); switch (PCD->getCommentKind()) { case PCK_Unknown: llvm_unreachable("unexpected pragma comment kind"); case PCK_Linker: AppendLinkerOptions(PCD->getArg()); break; case PCK_Lib: AddDependentLib(PCD->getArg()); break; case PCK_Compiler: case PCK_ExeStr: case PCK_User: break; // We ignore all of these. } break; } case Decl::PragmaDetectMismatch: { const auto *PDMD = cast(D); AddDetectMismatch(PDMD->getName(), PDMD->getValue()); break; } case Decl::LinkageSpec: EmitLinkageSpec(cast(D)); break; case Decl::FileScopeAsm: { // File-scope asm is ignored during device-side CUDA compilation. if (LangOpts.CUDA && LangOpts.CUDAIsDevice) break; // File-scope asm is ignored during device-side OpenMP compilation. if (LangOpts.OpenMPIsTargetDevice) break; // File-scope asm is ignored during device-side SYCL compilation. if (LangOpts.SYCLIsDevice) break; auto *AD = cast(D); getModule().appendModuleInlineAsm(AD->getAsmString()->getString()); break; } case Decl::TopLevelStmt: EmitTopLevelStmt(cast(D)); break; case Decl::Import: { auto *Import = cast(D); // If we've already imported this module, we're done. if (!ImportedModules.insert(Import->getImportedModule())) break; // Emit debug information for direct imports. if (!Import->getImportedOwningModule()) { if (CGDebugInfo *DI = getModuleDebugInfo()) DI->EmitImportDecl(*Import); } // For C++ standard modules we are done - we will call the module // initializer for imported modules, and that will likewise call those for // any imports it has. if (CXX20ModuleInits && Import->getImportedOwningModule() && !Import->getImportedOwningModule()->isModuleMapModule()) break; // For clang C++ module map modules the initializers for sub-modules are // emitted here. // Find all of the submodules and emit the module initializers. llvm::SmallPtrSet Visited; SmallVector Stack; Visited.insert(Import->getImportedModule()); Stack.push_back(Import->getImportedModule()); while (!Stack.empty()) { clang::Module *Mod = Stack.pop_back_val(); if (!EmittedModuleInitializers.insert(Mod).second) continue; for (auto *D : Context.getModuleInitializers(Mod)) EmitTopLevelDecl(D); // Visit the submodules of this module. for (auto *Submodule : Mod->submodules()) { // Skip explicit children; they need to be explicitly imported to emit // the initializers. 
if (Submodule->IsExplicit) continue; if (Visited.insert(Submodule).second) Stack.push_back(Submodule); } } break; } case Decl::Export: EmitDeclContext(cast(D)); break; case Decl::OMPThreadPrivate: EmitOMPThreadPrivateDecl(cast(D)); break; case Decl::OMPAllocate: EmitOMPAllocateDecl(cast(D)); break; case Decl::OMPDeclareReduction: EmitOMPDeclareReduction(cast(D)); break; case Decl::OMPDeclareMapper: EmitOMPDeclareMapper(cast(D)); break; case Decl::OMPRequires: EmitOMPRequiresDecl(cast(D)); break; case Decl::Typedef: case Decl::TypeAlias: // using foo = bar; [C++11] if (CGDebugInfo *DI = getModuleDebugInfo()) DI->EmitAndRetainType( getContext().getTypedefType(cast(D))); break; case Decl::Record: if (CGDebugInfo *DI = getModuleDebugInfo()) if (cast(D)->getDefinition()) DI->EmitAndRetainType(getContext().getRecordType(cast(D))); break; case Decl::Enum: if (CGDebugInfo *DI = getModuleDebugInfo()) if (cast(D)->getDefinition()) DI->EmitAndRetainType(getContext().getEnumType(cast(D))); break; case Decl::HLSLBuffer: getHLSLRuntime().addBuffer(cast(D)); break; default: // Make sure we handled everything we should, every other kind is a // non-top-level decl. FIXME: Would be nice to have an isTopLevelDeclKind // function. Need to recode Decl::Kind to do that easily. assert(isa(D) && "Unsupported decl kind"); break; } } void CodeGenModule::AddDeferredUnusedCoverageMapping(Decl *D) { // Do we need to generate coverage mapping? if (!CodeGenOpts.CoverageMapping) return; switch (D->getKind()) { case Decl::CXXConversion: case Decl::CXXMethod: case Decl::Function: case Decl::ObjCMethod: case Decl::CXXConstructor: case Decl::CXXDestructor: { if (!cast(D)->doesThisDeclarationHaveABody()) break; SourceManager &SM = getContext().getSourceManager(); if (LimitedCoverage && SM.getMainFileID() != SM.getFileID(D->getBeginLoc())) break; auto I = DeferredEmptyCoverageMappingDecls.find(D); if (I == DeferredEmptyCoverageMappingDecls.end()) DeferredEmptyCoverageMappingDecls[D] = true; break; } default: break; }; } void CodeGenModule::ClearUnusedCoverageMapping(const Decl *D) { // Do we need to generate coverage mapping? if (!CodeGenOpts.CoverageMapping) return; if (const auto *Fn = dyn_cast(D)) { if (Fn->isTemplateInstantiation()) ClearUnusedCoverageMapping(Fn->getTemplateInstantiationPattern()); } auto I = DeferredEmptyCoverageMappingDecls.find(D); if (I == DeferredEmptyCoverageMappingDecls.end()) DeferredEmptyCoverageMappingDecls[D] = false; else I->second = false; } void CodeGenModule::EmitDeferredUnusedCoverageMappings() { // We call takeVector() here to avoid use-after-free. // FIXME: DeferredEmptyCoverageMappingDecls is getting mutated because // we deserialize function bodies to emit coverage info for them, and that // deserializes more declarations. How should we handle that case? 
for (const auto &Entry : DeferredEmptyCoverageMappingDecls.takeVector()) { if (!Entry.second) continue; const Decl *D = Entry.first; switch (D->getKind()) { case Decl::CXXConversion: case Decl::CXXMethod: case Decl::Function: case Decl::ObjCMethod: { CodeGenPGO PGO(*this); GlobalDecl GD(cast(D)); PGO.emitEmptyCounterMapping(D, getMangledName(GD), getFunctionLinkage(GD)); break; } case Decl::CXXConstructor: { CodeGenPGO PGO(*this); GlobalDecl GD(cast(D), Ctor_Base); PGO.emitEmptyCounterMapping(D, getMangledName(GD), getFunctionLinkage(GD)); break; } case Decl::CXXDestructor: { CodeGenPGO PGO(*this); GlobalDecl GD(cast(D), Dtor_Base); PGO.emitEmptyCounterMapping(D, getMangledName(GD), getFunctionLinkage(GD)); break; } default: break; }; } } void CodeGenModule::EmitMainVoidAlias() { // In order to transition away from "__original_main" gracefully, emit an // alias for "main" in the no-argument case so that libc can detect when // new-style no-argument main is in used. if (llvm::Function *F = getModule().getFunction("main")) { if (!F->isDeclaration() && F->arg_size() == 0 && !F->isVarArg() && F->getReturnType()->isIntegerTy(Context.getTargetInfo().getIntWidth())) { auto *GA = llvm::GlobalAlias::create("__main_void", F); GA->setVisibility(llvm::GlobalValue::HiddenVisibility); } } } /// Turns the given pointer into a constant. static llvm::Constant *GetPointerConstant(llvm::LLVMContext &Context, const void *Ptr) { uintptr_t PtrInt = reinterpret_cast(Ptr); llvm::Type *i64 = llvm::Type::getInt64Ty(Context); return llvm::ConstantInt::get(i64, PtrInt); } static void EmitGlobalDeclMetadata(CodeGenModule &CGM, llvm::NamedMDNode *&GlobalMetadata, GlobalDecl D, llvm::GlobalValue *Addr) { if (!GlobalMetadata) GlobalMetadata = CGM.getModule().getOrInsertNamedMetadata("clang.global.decl.ptrs"); // TODO: should we report variant information for ctors/dtors? llvm::Metadata *Ops[] = {llvm::ConstantAsMetadata::get(Addr), llvm::ConstantAsMetadata::get(GetPointerConstant( CGM.getLLVMContext(), D.getDecl()))}; GlobalMetadata->addOperand(llvm::MDNode::get(CGM.getLLVMContext(), Ops)); } bool CodeGenModule::CheckAndReplaceExternCIFuncs(llvm::GlobalValue *Elem, llvm::GlobalValue *CppFunc) { // Store the list of ifuncs we need to replace uses in. llvm::SmallVector IFuncs; // List of ConstantExprs that we should be able to delete when we're done // here. llvm::SmallVector CEs; // It isn't valid to replace the extern-C ifuncs if all we find is itself! if (Elem == CppFunc) return false; // First make sure that all users of this are ifuncs (or ifuncs via a // bitcast), and collect the list of ifuncs and CEs so we can work on them // later. for (llvm::User *User : Elem->users()) { // Users can either be a bitcast ConstExpr that is used by the ifuncs, OR an // ifunc directly. In any other case, just give up, as we don't know what we // could break by changing those. if (auto *ConstExpr = dyn_cast(User)) { if (ConstExpr->getOpcode() != llvm::Instruction::BitCast) return false; for (llvm::User *CEUser : ConstExpr->users()) { if (auto *IFunc = dyn_cast(CEUser)) { IFuncs.push_back(IFunc); } else { return false; } } CEs.push_back(ConstExpr); } else if (auto *IFunc = dyn_cast(User)) { IFuncs.push_back(IFunc); } else { // This user is one we don't know how to handle, so fail redirection. This // will result in an ifunc retaining a resolver name that will ultimately // fail to be resolved to a defined function. 
return false; } } // Now we know this is a valid case where we can do this alias replacement, we // need to remove all of the references to Elem (and the bitcasts!) so we can // delete it. for (llvm::GlobalIFunc *IFunc : IFuncs) IFunc->setResolver(nullptr); for (llvm::ConstantExpr *ConstExpr : CEs) ConstExpr->destroyConstant(); // We should now be out of uses for the 'old' version of this function, so we // can erase it as well. Elem->eraseFromParent(); for (llvm::GlobalIFunc *IFunc : IFuncs) { // The type of the resolver is always just a function-type that returns the // type of the IFunc, so create that here. If the type of the actual // resolver doesn't match, it just gets bitcast to the right thing. auto *ResolverTy = llvm::FunctionType::get(IFunc->getType(), /*isVarArg*/ false); llvm::Constant *Resolver = GetOrCreateLLVMFunction( CppFunc->getName(), ResolverTy, {}, /*ForVTable*/ false); IFunc->setResolver(Resolver); } return true; } /// For each function which is declared within an extern "C" region and marked /// as 'used', but has internal linkage, create an alias from the unmangled /// name to the mangled name if possible. People expect to be able to refer /// to such functions with an unmangled name from inline assembly within the /// same translation unit. void CodeGenModule::EmitStaticExternCAliases() { if (!getTargetCodeGenInfo().shouldEmitStaticExternCAliases()) return; for (auto &I : StaticExternCValues) { IdentifierInfo *Name = I.first; llvm::GlobalValue *Val = I.second; // If Val is null, that implies there were multiple declarations that each // had a claim to the unmangled name. In this case, generation of the alias // is suppressed. See CodeGenModule::MaybeHandleStaticInExternC. if (!Val) break; llvm::GlobalValue *ExistingElem = getModule().getNamedValue(Name->getName()); // If there is either not something already by this name, or we were able to // replace all uses from IFuncs, create the alias. if (!ExistingElem || CheckAndReplaceExternCIFuncs(ExistingElem, Val)) addCompilerUsedGlobal(llvm::GlobalAlias::create(Name->getName(), Val)); } } bool CodeGenModule::lookupRepresentativeDecl(StringRef MangledName, GlobalDecl &Result) const { auto Res = Manglings.find(MangledName); if (Res == Manglings.end()) return false; Result = Res->getValue(); return true; } /// Emits metadata nodes associating all the global values in the /// current module with the Decls they came from. This is useful for /// projects using IR gen as a subroutine. /// /// Since there's currently no way to associate an MDNode directly /// with an llvm::GlobalValue, we create a global named metadata /// with the name 'clang.global.decl.ptrs'. void CodeGenModule::EmitDeclMetadata() { llvm::NamedMDNode *GlobalMetadata = nullptr; for (auto &I : MangledDeclNames) { llvm::GlobalValue *Addr = getModule().getNamedValue(I.second); // Some mangled names don't necessarily have an associated GlobalValue // in this module, e.g. if we mangled it for DebugInfo. if (Addr) EmitGlobalDeclMetadata(*this, GlobalMetadata, I.first, Addr); } } /// Emits metadata nodes for all the local variables in the current /// function. void CodeGenFunction::EmitDeclMetadata() { if (LocalDeclMap.empty()) return; llvm::LLVMContext &Context = getLLVMContext(); // Find the unique metadata ID for this name. 
  unsigned DeclPtrKind = Context.getMDKindID("clang.decl.ptr");

  llvm::NamedMDNode *GlobalMetadata = nullptr;

  for (auto &I : LocalDeclMap) {
    const Decl *D = I.first;
    llvm::Value *Addr = I.second.getPointer();
    if (auto *Alloca = dyn_cast<llvm::AllocaInst>(Addr)) {
      llvm::Value *DAddr = GetPointerConstant(getLLVMContext(), D);
      Alloca->setMetadata(
          DeclPtrKind, llvm::MDNode::get(
                           Context, llvm::ValueAsMetadata::getConstant(DAddr)));
    } else if (auto *GV = dyn_cast<llvm::GlobalValue>(Addr)) {
      GlobalDecl GD = GlobalDecl(cast<VarDecl>(D));
      EmitGlobalDeclMetadata(CGM, GlobalMetadata, GD, GV);
    }
  }
}

void CodeGenModule::EmitVersionIdentMetadata() {
  llvm::NamedMDNode *IdentMetadata =
      TheModule.getOrInsertNamedMetadata("llvm.ident");
  std::string Version = getClangFullVersion();
  llvm::LLVMContext &Ctx = TheModule.getContext();

  llvm::Metadata *IdentNode[] = {llvm::MDString::get(Ctx, Version)};
  IdentMetadata->addOperand(llvm::MDNode::get(Ctx, IdentNode));
}

void CodeGenModule::EmitCommandLineMetadata() {
  llvm::NamedMDNode *CommandLineMetadata =
      TheModule.getOrInsertNamedMetadata("llvm.commandline");
  std::string CommandLine = getCodeGenOpts().RecordCommandLine;
  llvm::LLVMContext &Ctx = TheModule.getContext();

  llvm::Metadata *CommandLineNode[] = {llvm::MDString::get(Ctx, CommandLine)};
  CommandLineMetadata->addOperand(llvm::MDNode::get(Ctx, CommandLineNode));
}

void CodeGenModule::EmitCoverageFile() {
  llvm::NamedMDNode *CUNode = TheModule.getNamedMetadata("llvm.dbg.cu");
  if (!CUNode)
    return;

  llvm::NamedMDNode *GCov = TheModule.getOrInsertNamedMetadata("llvm.gcov");
  llvm::LLVMContext &Ctx = TheModule.getContext();
  auto *CoverageDataFile =
      llvm::MDString::get(Ctx, getCodeGenOpts().CoverageDataFile);
  auto *CoverageNotesFile =
      llvm::MDString::get(Ctx, getCodeGenOpts().CoverageNotesFile);
  for (int i = 0, e = CUNode->getNumOperands(); i != e; ++i) {
    llvm::MDNode *CU = CUNode->getOperand(i);
    llvm::Metadata *Elts[] = {CoverageNotesFile, CoverageDataFile, CU};
    GCov->addOperand(llvm::MDNode::get(Ctx, Elts));
  }
}

llvm::Constant *CodeGenModule::GetAddrOfRTTIDescriptor(QualType Ty,
                                                       bool ForEH) {
  // Return a bogus pointer if RTTI is disabled, unless it's for EH.
  // FIXME: should we even be calling this method if RTTI is disabled
  // and it's not for EH?
  if (!shouldEmitRTTI(ForEH))
    return llvm::Constant::getNullValue(GlobalsInt8PtrTy);

  if (ForEH && Ty->isObjCObjectPointerType() &&
      LangOpts.ObjCRuntime.isGNUFamily())
    return ObjCRuntime->GetEHType(Ty);

  return getCXXABI().getAddrOfRTTIDescriptor(Ty);
}

void CodeGenModule::EmitOMPThreadPrivateDecl(const OMPThreadPrivateDecl *D) {
  // Do not emit threadprivates in simd-only mode.
  if (LangOpts.OpenMP && LangOpts.OpenMPSimd)
    return;
  for (auto RefExpr : D->varlists()) {
    auto *VD = cast<VarDecl>(cast<DeclRefExpr>(RefExpr)->getDecl());
    bool PerformInit =
        VD->getAnyInitializer() &&
        !VD->getAnyInitializer()->isConstantInitializer(getContext(),
                                                        /*ForRef=*/false);

    Address Addr(GetAddrOfGlobalVar(VD),
                 getTypes().ConvertTypeForMem(VD->getType()),
                 getContext().getDeclAlign(VD));
    if (auto InitFunction = getOpenMPRuntime().emitThreadPrivateVarDefinition(
            VD, Addr, RefExpr->getBeginLoc(), PerformInit))
      CXXGlobalInits.push_back(InitFunction);
  }
}

llvm::Metadata *
CodeGenModule::CreateMetadataIdentifierImpl(QualType T, MetadataTypeMap &Map,
                                            StringRef Suffix) {
  if (auto *FnType = T->getAs<FunctionProtoType>())
    T = getContext().getFunctionType(
        FnType->getReturnType(), FnType->getParamTypes(),
        FnType->getExtProtoInfo().withExceptionSpec(EST_None));

  llvm::Metadata *&InternalId = Map[T.getCanonicalType()];
  if (InternalId)
    return InternalId;

  if (isExternallyVisible(T->getLinkage())) {
    std::string OutName;
    llvm::raw_string_ostream Out(OutName);
    getCXXABI().getMangleContext().mangleTypeName(
        T, Out, getCodeGenOpts().SanitizeCfiICallNormalizeIntegers);

    if (getCodeGenOpts().SanitizeCfiICallNormalizeIntegers)
      Out << ".normalized";

    Out << Suffix;

    InternalId = llvm::MDString::get(getLLVMContext(), Out.str());
  } else {
    InternalId = llvm::MDNode::getDistinct(getLLVMContext(),
                                           llvm::ArrayRef<llvm::Metadata *>());
  }

  return InternalId;
}

llvm::Metadata *CodeGenModule::CreateMetadataIdentifierForType(QualType T) {
  return CreateMetadataIdentifierImpl(T, MetadataIdMap, "");
}

llvm::Metadata *
CodeGenModule::CreateMetadataIdentifierForVirtualMemPtrType(QualType T) {
  return CreateMetadataIdentifierImpl(T, VirtualMetadataIdMap, ".virtual");
}

// Generalize pointer types to a void pointer with the qualifiers of the
// originally pointed-to type, e.g. 'const char *' and 'char * const *'
// generalize to 'const void *' while 'char *' and 'const char **' generalize to
// 'void *'.
static QualType GeneralizeType(ASTContext &Ctx, QualType Ty) {
  if (!Ty->isPointerType())
    return Ty;

  return Ctx.getPointerType(
      QualType(Ctx.VoidTy).withCVRQualifiers(
          Ty->getPointeeType().getCVRQualifiers()));
}

// Apply type generalization to a FunctionType's return and argument types
static QualType GeneralizeFunctionType(ASTContext &Ctx, QualType Ty) {
  if (auto *FnType = Ty->getAs<FunctionProtoType>()) {
    SmallVector<QualType, 8> GeneralizedParams;
    for (auto &Param : FnType->param_types())
      GeneralizedParams.push_back(GeneralizeType(Ctx, Param));

    return Ctx.getFunctionType(
        GeneralizeType(Ctx, FnType->getReturnType()),
        GeneralizedParams, FnType->getExtProtoInfo());
  }

  if (auto *FnType = Ty->getAs<FunctionNoProtoType>())
    return Ctx.getFunctionNoProtoType(
        GeneralizeType(Ctx, FnType->getReturnType()));

  llvm_unreachable("Encountered unknown FunctionType");
}

llvm::Metadata *CodeGenModule::CreateMetadataIdentifierGeneralized(QualType T) {
  return CreateMetadataIdentifierImpl(GeneralizeFunctionType(getContext(), T),
                                      GeneralizedMetadataIdMap, ".generalized");
}

/// Returns whether this module needs the "all-vtables" type identifier.
bool CodeGenModule::NeedAllVtablesTypeId() const {
  // Returns true if at least one of vtable-based CFI checkers is enabled and
  // is not in the trapping mode.
return ((LangOpts.Sanitize.has(SanitizerKind::CFIVCall) && !CodeGenOpts.SanitizeTrap.has(SanitizerKind::CFIVCall)) || (LangOpts.Sanitize.has(SanitizerKind::CFINVCall) && !CodeGenOpts.SanitizeTrap.has(SanitizerKind::CFINVCall)) || (LangOpts.Sanitize.has(SanitizerKind::CFIDerivedCast) && !CodeGenOpts.SanitizeTrap.has(SanitizerKind::CFIDerivedCast)) || (LangOpts.Sanitize.has(SanitizerKind::CFIUnrelatedCast) && !CodeGenOpts.SanitizeTrap.has(SanitizerKind::CFIUnrelatedCast))); } void CodeGenModule::AddVTableTypeMetadata(llvm::GlobalVariable *VTable, CharUnits Offset, const CXXRecordDecl *RD) { llvm::Metadata *MD = CreateMetadataIdentifierForType(QualType(RD->getTypeForDecl(), 0)); VTable->addTypeMetadata(Offset.getQuantity(), MD); if (CodeGenOpts.SanitizeCfiCrossDso) if (auto CrossDsoTypeId = CreateCrossDsoCfiTypeId(MD)) VTable->addTypeMetadata(Offset.getQuantity(), llvm::ConstantAsMetadata::get(CrossDsoTypeId)); if (NeedAllVtablesTypeId()) { llvm::Metadata *MD = llvm::MDString::get(getLLVMContext(), "all-vtables"); VTable->addTypeMetadata(Offset.getQuantity(), MD); } } llvm::SanitizerStatReport &CodeGenModule::getSanStats() { if (!SanStats) SanStats = std::make_unique(&getModule()); return *SanStats; } llvm::Value * CodeGenModule::createOpenCLIntToSamplerConversion(const Expr *E, CodeGenFunction &CGF) { llvm::Constant *C = ConstantEmitter(CGF).emitAbstract(E, E->getType()); auto *SamplerT = getOpenCLRuntime().getSamplerType(E->getType().getTypePtr()); auto *FTy = llvm::FunctionType::get(SamplerT, {C->getType()}, false); auto *Call = CGF.EmitRuntimeCall( CreateRuntimeFunction(FTy, "__translate_sampler_initializer"), {C}); return Call; } CharUnits CodeGenModule::getNaturalPointeeTypeAlignment( QualType T, LValueBaseInfo *BaseInfo, TBAAAccessInfo *TBAAInfo) { return getNaturalTypeAlignment(T->getPointeeType(), BaseInfo, TBAAInfo, /* forPointeeType= */ true); } CharUnits CodeGenModule::getNaturalTypeAlignment(QualType T, LValueBaseInfo *BaseInfo, TBAAAccessInfo *TBAAInfo, bool forPointeeType) { if (TBAAInfo) *TBAAInfo = getTBAAAccessInfo(T); // FIXME: This duplicates logic in ASTContext::getTypeAlignIfKnown. But // that doesn't return the information we need to compute BaseInfo. // Honor alignment typedef attributes even on incomplete types. // We also honor them straight for C++ class types, even as pointees; // there's an expressivity gap here. if (auto TT = T->getAs()) { if (auto Align = TT->getDecl()->getMaxAlignment()) { if (BaseInfo) *BaseInfo = LValueBaseInfo(AlignmentSource::AttributedType); return getContext().toCharUnitsFromBits(Align); } } bool AlignForArray = T->isArrayType(); // Analyze the base element type, so we don't get confused by incomplete // array types. T = getContext().getBaseElementType(T); if (T->isIncompleteType()) { // We could try to replicate the logic from // ASTContext::getTypeAlignIfKnown, but nothing uses the alignment if the // type is incomplete, so it's impossible to test. We could try to reuse // getTypeAlignIfKnown, but that doesn't return the information we need // to set BaseInfo. So just ignore the possibility that the alignment is // greater than one. 
if (BaseInfo) *BaseInfo = LValueBaseInfo(AlignmentSource::Type); return CharUnits::One(); } if (BaseInfo) *BaseInfo = LValueBaseInfo(AlignmentSource::Type); CharUnits Alignment; const CXXRecordDecl *RD; if (T.getQualifiers().hasUnaligned()) { Alignment = CharUnits::One(); } else if (forPointeeType && !AlignForArray && (RD = T->getAsCXXRecordDecl())) { // For C++ class pointees, we don't know whether we're pointing at a // base or a complete object, so we generally need to use the // non-virtual alignment. Alignment = getClassPointerAlignment(RD); } else { Alignment = getContext().getTypeAlignInChars(T); } // Cap to the global maximum type alignment unless the alignment // was somehow explicit on the type. if (unsigned MaxAlign = getLangOpts().MaxTypeAlign) { if (Alignment.getQuantity() > MaxAlign && !getContext().isAlignmentRequired(T)) Alignment = CharUnits::fromQuantity(MaxAlign); } return Alignment; } bool CodeGenModule::stopAutoInit() { unsigned StopAfter = getContext().getLangOpts().TrivialAutoVarInitStopAfter; if (StopAfter) { // This number is positive only when -ftrivial-auto-var-init-stop-after=* is // used if (NumAutoVarInit >= StopAfter) { return true; } if (!NumAutoVarInit) { unsigned DiagID = getDiags().getCustomDiagID( DiagnosticsEngine::Warning, "-ftrivial-auto-var-init-stop-after=%0 has been enabled to limit the " "number of times ftrivial-auto-var-init=%1 gets applied."); getDiags().Report(DiagID) << StopAfter << (getContext().getLangOpts().getTrivialAutoVarInit() == LangOptions::TrivialAutoVarInitKind::Zero ? "zero" : "pattern"); } ++NumAutoVarInit; } return false; } void CodeGenModule::printPostfixForExternalizedDecl(llvm::raw_ostream &OS, const Decl *D) const { // ptxas does not allow '.' in symbol names. On the other hand, HIP prefers // postfix beginning with '.' since the symbol name can be demangled. if (LangOpts.HIP) OS << (isa(D) ? ".static." : ".intern."); else OS << (isa(D) ? "__static__" : "__intern__"); // If the CUID is not specified we try to generate a unique postfix. if (getLangOpts().CUID.empty()) { SourceManager &SM = getContext().getSourceManager(); PresumedLoc PLoc = SM.getPresumedLoc(D->getLocation()); assert(PLoc.isValid() && "Source location is expected to be valid."); // Get the hash of the user defined macros. llvm::MD5 Hash; llvm::MD5::MD5Result Result; for (const auto &Arg : PreprocessorOpts.Macros) Hash.update(Arg.first); Hash.final(Result); // Get the UniqueID for the file containing the decl. 
    llvm::sys::fs::UniqueID ID;
    if (auto EC = llvm::sys::fs::getUniqueID(PLoc.getFilename(), ID)) {
      PLoc = SM.getPresumedLoc(D->getLocation(), /*UseLineDirectives=*/false);
      assert(PLoc.isValid() && "Source location is expected to be valid.");
      if (auto EC = llvm::sys::fs::getUniqueID(PLoc.getFilename(), ID))
        SM.getDiagnostics().Report(diag::err_cannot_open_file)
            << PLoc.getFilename() << EC.message();
    }
    OS << llvm::format("%x", ID.getFile()) << llvm::format("%x", ID.getDevice())
       << "_" << llvm::utohexstr(Result.low(), /*LowerCase=*/true, /*Width=*/8);
  } else {
    OS << getContext().getCUIDHash();
  }
}

void CodeGenModule::moveLazyEmissionStates(CodeGenModule *NewBuilder) {
  assert(DeferredDeclsToEmit.empty() &&
         "Should have emitted all decls deferred to emit.");
  assert(NewBuilder->DeferredDecls.empty() &&
         "Newly created module should not have deferred decls");
  NewBuilder->DeferredDecls = std::move(DeferredDecls);
  assert(NewBuilder->DeferredVTables.empty() &&
         "Newly created module should not have deferred vtables");
  NewBuilder->DeferredVTables = std::move(DeferredVTables);
  assert(NewBuilder->MangledDeclNames.empty() &&
         "Newly created module should not have mangled decl names");
  assert(NewBuilder->Manglings.empty() &&
         "Newly created module should not have manglings");
  NewBuilder->Manglings = std::move(Manglings);
  NewBuilder->WeakRefReferences = std::move(WeakRefReferences);
  NewBuilder->TBAA = std::move(TBAA);
  assert(NewBuilder->EmittedDeferredDecls.empty() &&
         "Still have (unmerged) EmittedDeferredDecls deferred decls");
  NewBuilder->EmittedDeferredDecls = std::move(EmittedDeferredDecls);
  NewBuilder->ABI->MangleCtx = std::move(ABI->MangleCtx);
}
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index f6ea4d0b4366..bdbdad9362e1 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -1,6630 +1,6636 @@
//===--- Driver.cpp - Clang GCC Compatible Driver -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "clang/Driver/Driver.h" #include "ToolChains/AIX.h" #include "ToolChains/AMDGPU.h" #include "ToolChains/AMDGPUOpenMP.h" #include "ToolChains/AVR.h" #include "ToolChains/Ananas.h" #include "ToolChains/Arch/RISCV.h" #include "ToolChains/BareMetal.h" #include "ToolChains/CSKYToolChain.h" #include "ToolChains/Clang.h" #include "ToolChains/CloudABI.h" #include "ToolChains/Contiki.h" #include "ToolChains/CrossWindows.h" #include "ToolChains/Cuda.h" #include "ToolChains/Darwin.h" #include "ToolChains/DragonFly.h" #include "ToolChains/FreeBSD.h" #include "ToolChains/Fuchsia.h" #include "ToolChains/Gnu.h" #include "ToolChains/HIPAMD.h" #include "ToolChains/HIPSPV.h" #include "ToolChains/HLSL.h" #include "ToolChains/Haiku.h" #include "ToolChains/Hexagon.h" #include "ToolChains/Hurd.h" #include "ToolChains/Lanai.h" #include "ToolChains/Linux.h" #include "ToolChains/MSP430.h" #include "ToolChains/MSVC.h" #include "ToolChains/MinGW.h" #include "ToolChains/Minix.h" #include "ToolChains/MipsLinux.h" #include "ToolChains/Myriad.h" #include "ToolChains/NaCl.h" #include "ToolChains/NetBSD.h" #include "ToolChains/OHOS.h" #include "ToolChains/OpenBSD.h" #include "ToolChains/PPCFreeBSD.h" #include "ToolChains/PPCLinux.h" #include "ToolChains/PS4CPU.h" #include "ToolChains/RISCVToolchain.h" #include "ToolChains/SPIRV.h" #include "ToolChains/Solaris.h" #include "ToolChains/TCE.h" #include "ToolChains/VEToolchain.h" #include "ToolChains/WebAssembly.h" #include "ToolChains/XCore.h" #include "ToolChains/ZOS.h" #include "clang/Basic/TargetID.h" #include "clang/Basic/Version.h" #include "clang/Config/config.h" #include "clang/Driver/Action.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/DriverDiagnostic.h" #include "clang/Driver/InputInfo.h" #include "clang/Driver/Job.h" #include "clang/Driver/Options.h" #include "clang/Driver/Phases.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" #include "clang/Driver/Types.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Config/llvm-config.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Option/OptSpecifier.h" #include "llvm/Option/OptTable.h" #include "llvm/Option/Option.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ExitCodes.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MD5.h" #include "llvm/Support/Path.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/Process.h" #include "llvm/Support/Program.h" #include "llvm/Support/StringSaver.h" #include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Host.h" #include // ::getenv #include #include #include #include #include #if LLVM_ON_UNIX #include // getpid #endif using namespace clang::driver; using namespace clang; using namespace llvm::opt; static std::optional getOffloadTargetTriple(const Driver &D, const ArgList &Args) { auto OffloadTargets = Args.getAllArgValues(options::OPT_offload_EQ); // Offload compilation flow does not support multiple targets for now. 
We // need the HIPActionBuilder (and possibly the CudaActionBuilder{,Base}too) // to support multiple tool chains first. switch (OffloadTargets.size()) { default: D.Diag(diag::err_drv_only_one_offload_target_supported); return std::nullopt; case 0: D.Diag(diag::err_drv_invalid_or_unsupported_offload_target) << ""; return std::nullopt; case 1: break; } return llvm::Triple(OffloadTargets[0]); } static std::optional getNVIDIAOffloadTargetTriple(const Driver &D, const ArgList &Args, const llvm::Triple &HostTriple) { if (!Args.hasArg(options::OPT_offload_EQ)) { return llvm::Triple(HostTriple.isArch64Bit() ? "nvptx64-nvidia-cuda" : "nvptx-nvidia-cuda"); } auto TT = getOffloadTargetTriple(D, Args); if (TT && (TT->getArch() == llvm::Triple::spirv32 || TT->getArch() == llvm::Triple::spirv64)) { if (Args.hasArg(options::OPT_emit_llvm)) return TT; D.Diag(diag::err_drv_cuda_offload_only_emit_bc); return std::nullopt; } D.Diag(diag::err_drv_invalid_or_unsupported_offload_target) << TT->str(); return std::nullopt; } static std::optional getHIPOffloadTargetTriple(const Driver &D, const ArgList &Args) { if (!Args.hasArg(options::OPT_offload_EQ)) { return llvm::Triple("amdgcn-amd-amdhsa"); // Default HIP triple. } auto TT = getOffloadTargetTriple(D, Args); if (!TT) return std::nullopt; if (TT->getArch() == llvm::Triple::amdgcn && TT->getVendor() == llvm::Triple::AMD && TT->getOS() == llvm::Triple::AMDHSA) return TT; if (TT->getArch() == llvm::Triple::spirv64) return TT; D.Diag(diag::err_drv_invalid_or_unsupported_offload_target) << TT->str(); return std::nullopt; } // static std::string Driver::GetResourcesPath(StringRef BinaryPath, StringRef CustomResourceDir) { // Since the resource directory is embedded in the module hash, it's important // that all places that need it call this function, so that they get the // exact same string ("a/../b/" and "b/" get different hashes, for example). // Dir is bin/ or lib/, depending on where BinaryPath is. std::string Dir = std::string(llvm::sys::path::parent_path(BinaryPath)); SmallString<128> P(Dir); if (CustomResourceDir != "") { llvm::sys::path::append(P, CustomResourceDir); } else { // On Windows, libclang.dll is in bin/. // On non-Windows, libclang.so/.dylib is in lib/. // With a static-library build of libclang, LibClangPath will contain the // path of the embedding binary, which for LLVM binaries will be in bin/. // ../lib gets us to lib/ in both cases. P = llvm::sys::path::parent_path(Dir); // This search path is also created in the COFF driver of lld, so any // changes here also needs to happen in lld/COFF/Driver.cpp llvm::sys::path::append(P, CLANG_INSTALL_LIBDIR_BASENAME, "clang", CLANG_VERSION_MAJOR_STRING); } return std::string(P.str()); } Driver::Driver(StringRef ClangExecutable, StringRef TargetTriple, DiagnosticsEngine &Diags, std::string Title, IntrusiveRefCntPtr VFS) : Diags(Diags), VFS(std::move(VFS)), Mode(GCCMode), SaveTemps(SaveTempsNone), BitcodeEmbed(EmbedNone), Offload(OffloadHostDevice), CXX20HeaderType(HeaderMode_None), ModulesModeCXX20(false), LTOMode(LTOK_None), ClangExecutable(ClangExecutable), SysRoot(DEFAULT_SYSROOT), DriverTitle(Title), CCCPrintBindings(false), CCPrintOptions(false), CCLogDiagnostics(false), CCGenDiagnostics(false), CCPrintProcessStats(false), CCPrintInternalStats(false), TargetTriple(TargetTriple), Saver(Alloc), PrependArg(nullptr), CheckInputsExist(true), ProbePrecompiled(true), SuppressMissingInputWarning(false) { // Provide a sane fallback if no VFS is specified. 
if (!this->VFS) this->VFS = llvm::vfs::getRealFileSystem(); Name = std::string(llvm::sys::path::filename(ClangExecutable)); Dir = std::string(llvm::sys::path::parent_path(ClangExecutable)); InstalledDir = Dir; // Provide a sensible default installed dir. if ((!SysRoot.empty()) && llvm::sys::path::is_relative(SysRoot)) { // Prepend InstalledDir if SysRoot is relative SmallString<128> P(InstalledDir); llvm::sys::path::append(P, SysRoot); SysRoot = std::string(P); } #if defined(CLANG_CONFIG_FILE_SYSTEM_DIR) SystemConfigDir = CLANG_CONFIG_FILE_SYSTEM_DIR; #endif #if defined(CLANG_CONFIG_FILE_USER_DIR) { SmallString<128> P; llvm::sys::fs::expand_tilde(CLANG_CONFIG_FILE_USER_DIR, P); UserConfigDir = static_cast(P); } #endif // Compute the path to the resource directory. ResourceDir = GetResourcesPath(ClangExecutable, CLANG_RESOURCE_DIR); } void Driver::setDriverMode(StringRef Value) { static const std::string OptName = getOpts().getOption(options::OPT_driver_mode).getPrefixedName(); if (auto M = llvm::StringSwitch>(Value) .Case("gcc", GCCMode) .Case("g++", GXXMode) .Case("cpp", CPPMode) .Case("cl", CLMode) .Case("flang", FlangMode) .Case("dxc", DXCMode) .Default(std::nullopt)) Mode = *M; else Diag(diag::err_drv_unsupported_option_argument) << OptName << Value; } InputArgList Driver::ParseArgStrings(ArrayRef ArgStrings, bool IsClCompatMode, bool &ContainsError) { llvm::PrettyStackTraceString CrashInfo("Command line argument parsing"); ContainsError = false; unsigned IncludedFlagsBitmask; unsigned ExcludedFlagsBitmask; std::tie(IncludedFlagsBitmask, ExcludedFlagsBitmask) = getIncludeExcludeOptionFlagMasks(IsClCompatMode); // Make sure that Flang-only options don't pollute the Clang output // TODO: Make sure that Clang-only options don't pollute Flang output if (!IsFlangMode()) ExcludedFlagsBitmask |= options::FlangOnlyOption; unsigned MissingArgIndex, MissingArgCount; InputArgList Args = getOpts().ParseArgs(ArgStrings, MissingArgIndex, MissingArgCount, IncludedFlagsBitmask, ExcludedFlagsBitmask); // Check for missing argument error. if (MissingArgCount) { Diag(diag::err_drv_missing_argument) << Args.getArgString(MissingArgIndex) << MissingArgCount; ContainsError |= Diags.getDiagnosticLevel(diag::err_drv_missing_argument, SourceLocation()) > DiagnosticsEngine::Warning; } // Check for unsupported options. for (const Arg *A : Args) { if (A->getOption().hasFlag(options::Unsupported)) { unsigned DiagID; auto ArgString = A->getAsString(Args); std::string Nearest; if (getOpts().findNearest( ArgString, Nearest, IncludedFlagsBitmask, ExcludedFlagsBitmask | options::Unsupported) > 1) { DiagID = diag::err_drv_unsupported_opt; Diag(DiagID) << ArgString; } else { DiagID = diag::err_drv_unsupported_opt_with_suggestion; Diag(DiagID) << ArgString << Nearest; } ContainsError |= Diags.getDiagnosticLevel(DiagID, SourceLocation()) > DiagnosticsEngine::Warning; continue; } // Warn about -mcpu= without an argument. 
if (A->getOption().matches(options::OPT_mcpu_EQ) && A->containsValue("")) { Diag(diag::warn_drv_empty_joined_argument) << A->getAsString(Args); ContainsError |= Diags.getDiagnosticLevel( diag::warn_drv_empty_joined_argument, SourceLocation()) > DiagnosticsEngine::Warning; } } for (const Arg *A : Args.filtered(options::OPT_UNKNOWN)) { unsigned DiagID; auto ArgString = A->getAsString(Args); std::string Nearest; if (getOpts().findNearest(ArgString, Nearest, IncludedFlagsBitmask, ExcludedFlagsBitmask) > 1) { if (!IsCLMode() && getOpts().findExact(ArgString, Nearest, options::CC1Option)) { DiagID = diag::err_drv_unknown_argument_with_suggestion; Diags.Report(DiagID) << ArgString << "-Xclang " + Nearest; } else { DiagID = IsCLMode() ? diag::warn_drv_unknown_argument_clang_cl : diag::err_drv_unknown_argument; Diags.Report(DiagID) << ArgString; } } else { DiagID = IsCLMode() ? diag::warn_drv_unknown_argument_clang_cl_with_suggestion : diag::err_drv_unknown_argument_with_suggestion; Diags.Report(DiagID) << ArgString << Nearest; } ContainsError |= Diags.getDiagnosticLevel(DiagID, SourceLocation()) > DiagnosticsEngine::Warning; } for (const Arg *A : Args.filtered(options::OPT_o)) { if (ArgStrings[A->getIndex()] == A->getSpelling()) continue; // Warn on joined arguments that are similar to a long argument. std::string ArgString = ArgStrings[A->getIndex()]; std::string Nearest; if (getOpts().findExact("-" + ArgString, Nearest, IncludedFlagsBitmask, ExcludedFlagsBitmask)) Diags.Report(diag::warn_drv_potentially_misspelled_joined_argument) << A->getAsString(Args) << Nearest; } return Args; } // Determine which compilation mode we are in. We look for options which // affect the phase, starting with the earliest phases, and record which // option we used to determine the final phase. phases::ID Driver::getFinalPhase(const DerivedArgList &DAL, Arg **FinalPhaseArg) const { Arg *PhaseArg = nullptr; phases::ID FinalPhase; // -{E,EP,P,M,MM} only run the preprocessor. if (CCCIsCPP() || (PhaseArg = DAL.getLastArg(options::OPT_E)) || (PhaseArg = DAL.getLastArg(options::OPT__SLASH_EP)) || (PhaseArg = DAL.getLastArg(options::OPT_M, options::OPT_MM)) || (PhaseArg = DAL.getLastArg(options::OPT__SLASH_P)) || CCGenDiagnostics) { FinalPhase = phases::Preprocess; // --precompile only runs up to precompilation. // Options that cause the output of C++20 compiled module interfaces or // header units have the same effect. } else if ((PhaseArg = DAL.getLastArg(options::OPT__precompile)) || (PhaseArg = DAL.getLastArg(options::OPT_extract_api)) || (PhaseArg = DAL.getLastArg(options::OPT_fmodule_header, options::OPT_fmodule_header_EQ))) { FinalPhase = phases::Precompile; // -{fsyntax-only,-analyze,emit-ast} only run up to the compiler. } else if ((PhaseArg = DAL.getLastArg(options::OPT_fsyntax_only)) || (PhaseArg = DAL.getLastArg(options::OPT_print_supported_cpus)) || (PhaseArg = DAL.getLastArg(options::OPT_module_file_info)) || (PhaseArg = DAL.getLastArg(options::OPT_verify_pch)) || (PhaseArg = DAL.getLastArg(options::OPT_rewrite_objc)) || (PhaseArg = DAL.getLastArg(options::OPT_rewrite_legacy_objc)) || (PhaseArg = DAL.getLastArg(options::OPT__migrate)) || (PhaseArg = DAL.getLastArg(options::OPT__analyze)) || (PhaseArg = DAL.getLastArg(options::OPT_emit_ast))) { FinalPhase = phases::Compile; // -S only runs up to the backend. } else if ((PhaseArg = DAL.getLastArg(options::OPT_S))) { FinalPhase = phases::Backend; // -c compilation only runs up to the assembler. 
} else if ((PhaseArg = DAL.getLastArg(options::OPT_c))) { FinalPhase = phases::Assemble; } else if ((PhaseArg = DAL.getLastArg(options::OPT_emit_interface_stubs))) { FinalPhase = phases::IfsMerge; // Otherwise do everything. } else FinalPhase = phases::Link; if (FinalPhaseArg) *FinalPhaseArg = PhaseArg; return FinalPhase; } static Arg *MakeInputArg(DerivedArgList &Args, const OptTable &Opts, StringRef Value, bool Claim = true) { Arg *A = new Arg(Opts.getOption(options::OPT_INPUT), Value, Args.getBaseArgs().MakeIndex(Value), Value.data()); Args.AddSynthesizedArg(A); if (Claim) A->claim(); return A; } DerivedArgList *Driver::TranslateInputArgs(const InputArgList &Args) const { const llvm::opt::OptTable &Opts = getOpts(); DerivedArgList *DAL = new DerivedArgList(Args); bool HasNostdlib = Args.hasArg(options::OPT_nostdlib); bool HasNostdlibxx = Args.hasArg(options::OPT_nostdlibxx); bool HasNodefaultlib = Args.hasArg(options::OPT_nodefaultlibs); bool IgnoreUnused = false; for (Arg *A : Args) { if (IgnoreUnused) A->claim(); if (A->getOption().matches(options::OPT_start_no_unused_arguments)) { IgnoreUnused = true; continue; } if (A->getOption().matches(options::OPT_end_no_unused_arguments)) { IgnoreUnused = false; continue; } // Unfortunately, we have to parse some forwarding options (-Xassembler, // -Xlinker, -Xpreprocessor) because we either integrate their functionality // (assembler and preprocessor), or bypass a previous driver ('collect2'). // Rewrite linker options, to replace --no-demangle with a custom internal // option. if ((A->getOption().matches(options::OPT_Wl_COMMA) || A->getOption().matches(options::OPT_Xlinker)) && A->containsValue("--no-demangle")) { // Add the rewritten no-demangle argument. DAL->AddFlagArg(A, Opts.getOption(options::OPT_Z_Xlinker__no_demangle)); // Add the remaining values as Xlinker arguments. for (StringRef Val : A->getValues()) if (Val != "--no-demangle") DAL->AddSeparateArg(A, Opts.getOption(options::OPT_Xlinker), Val); continue; } // Rewrite preprocessor options, to replace -Wp,-MD,FOO which is used by // some build systems. We don't try to be complete here because we don't // care to encourage this usage model. if (A->getOption().matches(options::OPT_Wp_COMMA) && (A->getValue(0) == StringRef("-MD") || A->getValue(0) == StringRef("-MMD"))) { // Rewrite to -MD/-MMD along with -MF. if (A->getValue(0) == StringRef("-MD")) DAL->AddFlagArg(A, Opts.getOption(options::OPT_MD)); else DAL->AddFlagArg(A, Opts.getOption(options::OPT_MMD)); if (A->getNumValues() == 2) DAL->AddSeparateArg(A, Opts.getOption(options::OPT_MF), A->getValue(1)); continue; } // Rewrite reserved library names. if (A->getOption().matches(options::OPT_l)) { StringRef Value = A->getValue(); // Rewrite unless -nostdlib is present. if (!HasNostdlib && !HasNodefaultlib && !HasNostdlibxx && Value == "stdc++") { DAL->AddFlagArg(A, Opts.getOption(options::OPT_Z_reserved_lib_stdcxx)); continue; } // Rewrite unconditionally. if (Value == "cc_kext") { DAL->AddFlagArg(A, Opts.getOption(options::OPT_Z_reserved_lib_cckext)); continue; } } // Pick up inputs via the -- option. if (A->getOption().matches(options::OPT__DASH_DASH)) { A->claim(); for (StringRef Val : A->getValues()) DAL->append(MakeInputArg(*DAL, Opts, Val, false)); continue; } DAL->append(A); } // Enforce -static if -miamcu is present. 
if (Args.hasFlag(options::OPT_miamcu, options::OPT_mno_iamcu, false)) DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_static)); // Add a default value of -mlinker-version=, if one was given and the user // didn't specify one. #if defined(HOST_LINK_VERSION) if (!Args.hasArg(options::OPT_mlinker_version_EQ) && strlen(HOST_LINK_VERSION) > 0) { DAL->AddJoinedArg(0, Opts.getOption(options::OPT_mlinker_version_EQ), HOST_LINK_VERSION); DAL->getLastArg(options::OPT_mlinker_version_EQ)->claim(); } #endif return DAL; } /// Compute target triple from args. /// /// This routine provides the logic to compute a target triple from various /// args passed to the driver and the default triple string. static llvm::Triple computeTargetTriple(const Driver &D, StringRef TargetTriple, const ArgList &Args, StringRef DarwinArchName = "") { // FIXME: Already done in Compilation *Driver::BuildCompilation if (const Arg *A = Args.getLastArg(options::OPT_target)) TargetTriple = A->getValue(); llvm::Triple Target(llvm::Triple::normalize(TargetTriple)); // GNU/Hurd's triples should have been -hurd-gnu*, but were historically made // -gnu* only, and we can not change this, so we have to detect that case as // being the Hurd OS. if (TargetTriple.contains("-unknown-gnu") || TargetTriple.contains("-pc-gnu")) Target.setOSName("hurd"); // Handle Apple-specific options available here. if (Target.isOSBinFormatMachO()) { // If an explicit Darwin arch name is given, that trumps all. if (!DarwinArchName.empty()) { tools::darwin::setTripleTypeForMachOArchName(Target, DarwinArchName, Args); return Target; } // Handle the Darwin '-arch' flag. if (Arg *A = Args.getLastArg(options::OPT_arch)) { StringRef ArchName = A->getValue(); tools::darwin::setTripleTypeForMachOArchName(Target, ArchName, Args); } } // Handle pseudo-target flags '-mlittle-endian'/'-EL' and // '-mbig-endian'/'-EB'. if (Arg *A = Args.getLastArgNoClaim(options::OPT_mlittle_endian, options::OPT_mbig_endian)) { llvm::Triple T = A->getOption().matches(options::OPT_mlittle_endian) ? Target.getLittleEndianArchVariant() : Target.getBigEndianArchVariant(); if (T.getArch() != llvm::Triple::UnknownArch) { Target = std::move(T); Args.claimAllArgs(options::OPT_mlittle_endian, options::OPT_mbig_endian); } } // Skip further flag support on OSes which don't support '-m32' or '-m64'. if (Target.getArch() == llvm::Triple::tce || Target.getOS() == llvm::Triple::Minix) return Target; // On AIX, the env OBJECT_MODE may affect the resulting arch variant. if (Target.isOSAIX()) { if (std::optional ObjectModeValue = llvm::sys::Process::GetEnv("OBJECT_MODE")) { StringRef ObjectMode = *ObjectModeValue; llvm::Triple::ArchType AT = llvm::Triple::UnknownArch; if (ObjectMode.equals("64")) { AT = Target.get64BitArchVariant().getArch(); } else if (ObjectMode.equals("32")) { AT = Target.get32BitArchVariant().getArch(); } else { D.Diag(diag::err_drv_invalid_object_mode) << ObjectMode; } if (AT != llvm::Triple::UnknownArch && AT != Target.getArch()) Target.setArch(AT); } } // The `-maix[32|64]` flags are only valid for AIX targets. if (Arg *A = Args.getLastArgNoClaim(options::OPT_maix32, options::OPT_maix64); A && !Target.isOSAIX()) D.Diag(diag::err_drv_unsupported_opt_for_target) << A->getAsString(Args) << Target.str(); // Handle pseudo-target flags '-m64', '-mx32', '-m32' and '-m16'. 
Arg *A = Args.getLastArg(options::OPT_m64, options::OPT_mx32, options::OPT_m32, options::OPT_m16, options::OPT_maix32, options::OPT_maix64); if (A) { llvm::Triple::ArchType AT = llvm::Triple::UnknownArch; if (A->getOption().matches(options::OPT_m64) || A->getOption().matches(options::OPT_maix64)) { AT = Target.get64BitArchVariant().getArch(); if (Target.getEnvironment() == llvm::Triple::GNUX32) Target.setEnvironment(llvm::Triple::GNU); else if (Target.getEnvironment() == llvm::Triple::MuslX32) Target.setEnvironment(llvm::Triple::Musl); } else if (A->getOption().matches(options::OPT_mx32) && Target.get64BitArchVariant().getArch() == llvm::Triple::x86_64) { AT = llvm::Triple::x86_64; if (Target.getEnvironment() == llvm::Triple::Musl) Target.setEnvironment(llvm::Triple::MuslX32); else Target.setEnvironment(llvm::Triple::GNUX32); } else if (A->getOption().matches(options::OPT_m32) || A->getOption().matches(options::OPT_maix32)) { AT = Target.get32BitArchVariant().getArch(); if (Target.getEnvironment() == llvm::Triple::GNUX32) Target.setEnvironment(llvm::Triple::GNU); else if (Target.getEnvironment() == llvm::Triple::MuslX32) Target.setEnvironment(llvm::Triple::Musl); } else if (A->getOption().matches(options::OPT_m16) && Target.get32BitArchVariant().getArch() == llvm::Triple::x86) { AT = llvm::Triple::x86; Target.setEnvironment(llvm::Triple::CODE16); } if (AT != llvm::Triple::UnknownArch && AT != Target.getArch()) { Target.setArch(AT); if (Target.isWindowsGNUEnvironment()) toolchains::MinGW::fixTripleArch(D, Target, Args); } } // Handle -miamcu flag. if (Args.hasFlag(options::OPT_miamcu, options::OPT_mno_iamcu, false)) { if (Target.get32BitArchVariant().getArch() != llvm::Triple::x86) D.Diag(diag::err_drv_unsupported_opt_for_target) << "-miamcu" << Target.str(); if (A && !A->getOption().matches(options::OPT_m32)) D.Diag(diag::err_drv_argument_not_allowed_with) << "-miamcu" << A->getBaseArg().getAsString(Args); Target.setArch(llvm::Triple::x86); Target.setArchName("i586"); Target.setEnvironment(llvm::Triple::UnknownEnvironment); Target.setEnvironmentName(""); Target.setOS(llvm::Triple::ELFIAMCU); Target.setVendor(llvm::Triple::UnknownVendor); Target.setVendorName("intel"); } // If target is MIPS adjust the target triple // accordingly to provided ABI name. 
if (Target.isMIPS()) { if ((A = Args.getLastArg(options::OPT_mabi_EQ))) { StringRef ABIName = A->getValue(); if (ABIName == "32") { Target = Target.get32BitArchVariant(); if (Target.getEnvironment() == llvm::Triple::GNUABI64 || Target.getEnvironment() == llvm::Triple::GNUABIN32) Target.setEnvironment(llvm::Triple::GNU); } else if (ABIName == "n32") { Target = Target.get64BitArchVariant(); if (Target.getEnvironment() == llvm::Triple::GNU || Target.getEnvironment() == llvm::Triple::GNUABI64) Target.setEnvironment(llvm::Triple::GNUABIN32); } else if (ABIName == "64") { Target = Target.get64BitArchVariant(); if (Target.getEnvironment() == llvm::Triple::GNU || Target.getEnvironment() == llvm::Triple::GNUABIN32) Target.setEnvironment(llvm::Triple::GNUABI64); } } } // If target is RISC-V adjust the target triple according to // provided architecture name if (Target.isRISCV()) { if (Args.hasArg(options::OPT_march_EQ) || Args.hasArg(options::OPT_mcpu_EQ)) { StringRef ArchName = tools::riscv::getRISCVArch(Args, Target); if (ArchName.starts_with_insensitive("rv32")) Target.setArch(llvm::Triple::riscv32); else if (ArchName.starts_with_insensitive("rv64")) Target.setArch(llvm::Triple::riscv64); } } return Target; } // Parse the LTO options and record the type of LTO compilation // based on which -f(no-)?lto(=.*)? or -f(no-)?offload-lto(=.*)? // option occurs last. static driver::LTOKind parseLTOMode(Driver &D, const llvm::opt::ArgList &Args, OptSpecifier OptEq, OptSpecifier OptNeg) { if (!Args.hasFlag(OptEq, OptNeg, false)) return LTOK_None; const Arg *A = Args.getLastArg(OptEq); StringRef LTOName = A->getValue(); driver::LTOKind LTOMode = llvm::StringSwitch(LTOName) .Case("full", LTOK_Full) .Case("thin", LTOK_Thin) .Default(LTOK_Unknown); if (LTOMode == LTOK_Unknown) { D.Diag(diag::err_drv_unsupported_option_argument) << A->getSpelling() << A->getValue(); return LTOK_None; } return LTOMode; } // Parse the LTO options. void Driver::setLTOMode(const llvm::opt::ArgList &Args) { LTOMode = parseLTOMode(*this, Args, options::OPT_flto_EQ, options::OPT_fno_lto); OffloadLTOMode = parseLTOMode(*this, Args, options::OPT_foffload_lto_EQ, options::OPT_fno_offload_lto); // Try to enable `-foffload-lto=full` if `-fopenmp-target-jit` is on. if (Args.hasFlag(options::OPT_fopenmp_target_jit, options::OPT_fno_openmp_target_jit, false)) { if (Arg *A = Args.getLastArg(options::OPT_foffload_lto_EQ, options::OPT_fno_offload_lto)) if (OffloadLTOMode != LTOK_Full) Diag(diag::err_drv_incompatible_options) << A->getSpelling() << "-fopenmp-target-jit"; OffloadLTOMode = LTOK_Full; } } /// Compute the desired OpenMP runtime from the flags provided. Driver::OpenMPRuntimeKind Driver::getOpenMPRuntime(const ArgList &Args) const { StringRef RuntimeName(CLANG_DEFAULT_OPENMP_RUNTIME); const Arg *A = Args.getLastArg(options::OPT_fopenmp_EQ); if (A) RuntimeName = A->getValue(); auto RT = llvm::StringSwitch(RuntimeName) .Case("libomp", OMPRT_OMP) .Case("libgomp", OMPRT_GOMP) .Case("libiomp5", OMPRT_IOMP5) .Default(OMPRT_Unknown); if (RT == OMPRT_Unknown) { if (A) Diag(diag::err_drv_unsupported_option_argument) << A->getSpelling() << A->getValue(); else // FIXME: We could use a nicer diagnostic here. Diag(diag::err_drv_unsupported_opt) << "-fopenmp"; } return RT; } void Driver::CreateOffloadingDeviceToolChains(Compilation &C, InputList &Inputs) { // // CUDA/HIP // // We need to generate a CUDA/HIP toolchain if any of the inputs has a CUDA // or HIP type. However, mixed CUDA/HIP compilation is not supported. 
bool IsCuda = llvm::any_of(Inputs, [](std::pair &I) { return types::isCuda(I.first); }); bool IsHIP = llvm::any_of(Inputs, [](std::pair &I) { return types::isHIP(I.first); }) || C.getInputArgs().hasArg(options::OPT_hip_link); if (IsCuda && IsHIP) { Diag(clang::diag::err_drv_mix_cuda_hip); return; } if (IsCuda) { const ToolChain *HostTC = C.getSingleOffloadToolChain(); const llvm::Triple &HostTriple = HostTC->getTriple(); auto OFK = Action::OFK_Cuda; auto CudaTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(), HostTriple); if (!CudaTriple) return; // Use the CUDA and host triples as the key into the ToolChains map, // because the device toolchain we create depends on both. auto &CudaTC = ToolChains[CudaTriple->str() + "/" + HostTriple.str()]; if (!CudaTC) { CudaTC = std::make_unique( *this, *CudaTriple, *HostTC, C.getInputArgs()); // Emit a warning if the detected CUDA version is too new. CudaInstallationDetector &CudaInstallation = static_cast(*CudaTC).CudaInstallation; if (CudaInstallation.isValid()) CudaInstallation.WarnIfUnsupportedVersion(); } C.addOffloadDeviceToolChain(CudaTC.get(), OFK); } else if (IsHIP) { if (auto *OMPTargetArg = C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) { Diag(clang::diag::err_drv_unsupported_opt_for_language_mode) << OMPTargetArg->getSpelling() << "HIP"; return; } const ToolChain *HostTC = C.getSingleOffloadToolChain(); auto OFK = Action::OFK_HIP; auto HIPTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs()); if (!HIPTriple) return; auto *HIPTC = &getOffloadingDeviceToolChain(C.getInputArgs(), *HIPTriple, *HostTC, OFK); assert(HIPTC && "Could not create offloading device tool chain."); C.addOffloadDeviceToolChain(HIPTC, OFK); } // // OpenMP // // We need to generate an OpenMP toolchain if the user specified targets with // the -fopenmp-targets option or used --offload-arch with OpenMP enabled. bool IsOpenMPOffloading = C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, options::OPT_fno_openmp, false) && (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) || C.getInputArgs().hasArg(options::OPT_offload_arch_EQ)); if (IsOpenMPOffloading) { // We expect that -fopenmp-targets is always used in conjunction with the // option -fopenmp specifying a valid runtime with offloading support, i.e. // libomp or libiomp. OpenMPRuntimeKind RuntimeKind = getOpenMPRuntime(C.getInputArgs()); if (RuntimeKind != OMPRT_OMP && RuntimeKind != OMPRT_IOMP5) { Diag(clang::diag::err_drv_expecting_fopenmp_with_fopenmp_targets); return; } llvm::StringMap> DerivedArchs; llvm::StringMap FoundNormalizedTriples; std::multiset OpenMPTriples; // If the user specified -fopenmp-targets= we create a toolchain for each // valid triple. Otherwise, if only --offload-arch= was specified we instead // attempt to derive the appropriate toolchains from the arguments. 
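// A standalone sketch of the "derive the toolchains from --offload-arch"
// path: arch names are bucketed by a guessed offload triple, loosely
// mirroring the DerivedArchs map built below. The triple strings here are
// placeholders, and the real code consults StringToCudaArch and the
// target-ID helpers instead of prefix checks.
#include <map>
#include <set>
#include <string>

static std::map<std::string, std::set<std::string>>
bucketOffloadArchs(const std::set<std::string> &Archs) {
  std::map<std::string, std::set<std::string>> Derived;
  for (const std::string &A : Archs) {
    if (A.rfind("sm_", 0) == 0)
      Derived["nvptx64-nvidia-cuda"].insert(A);   // NVIDIA-style arch
    else if (A.rfind("gfx", 0) == 0)
      Derived["amdgcn-amd-amdhsa"].insert(A);     // AMDGPU-style arch
    // anything else is diagnosed as err_drv_failed_to_deduce_target_from_arch
  }
  return Derived;
}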
if (Arg *OpenMPTargets = C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) { if (OpenMPTargets && !OpenMPTargets->getNumValues()) { Diag(clang::diag::warn_drv_empty_joined_argument) << OpenMPTargets->getAsString(C.getInputArgs()); return; } for (StringRef T : OpenMPTargets->getValues()) OpenMPTriples.insert(T); } else if (C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) && !IsHIP && !IsCuda) { const ToolChain *HostTC = C.getSingleOffloadToolChain(); auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs()); auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(), HostTC->getTriple()); // Attempt to deduce the offloading triple from the set of architectures. // We can only correctly deduce NVPTX / AMDGPU triples currently. We need // to temporarily create these toolchains so that we can access tools for // inferring architectures. llvm::DenseSet Archs; if (NVPTXTriple) { auto TempTC = std::make_unique( *this, *NVPTXTriple, *HostTC, C.getInputArgs()); for (StringRef Arch : getOffloadArchs( C, C.getArgs(), Action::OFK_OpenMP, &*TempTC, true)) Archs.insert(Arch); } if (AMDTriple) { auto TempTC = std::make_unique( *this, *AMDTriple, *HostTC, C.getInputArgs()); for (StringRef Arch : getOffloadArchs( C, C.getArgs(), Action::OFK_OpenMP, &*TempTC, true)) Archs.insert(Arch); } if (!AMDTriple && !NVPTXTriple) { for (StringRef Arch : getOffloadArchs(C, C.getArgs(), Action::OFK_OpenMP, nullptr, true)) Archs.insert(Arch); } for (StringRef Arch : Archs) { if (NVPTXTriple && IsNVIDIAGpuArch(StringToCudaArch( getProcessorFromTargetID(*NVPTXTriple, Arch)))) { DerivedArchs[NVPTXTriple->getTriple()].insert(Arch); } else if (AMDTriple && IsAMDGpuArch(StringToCudaArch( getProcessorFromTargetID(*AMDTriple, Arch)))) { DerivedArchs[AMDTriple->getTriple()].insert(Arch); } else { Diag(clang::diag::err_drv_failed_to_deduce_target_from_arch) << Arch; return; } } // If the set is empty then we failed to find a native architecture. if (Archs.empty()) { Diag(clang::diag::err_drv_failed_to_deduce_target_from_arch) << "native"; return; } for (const auto &TripleAndArchs : DerivedArchs) OpenMPTriples.insert(TripleAndArchs.first()); } for (StringRef Val : OpenMPTriples) { llvm::Triple TT(ToolChain::getOpenMPTriple(Val)); std::string NormalizedName = TT.normalize(); // Make sure we don't have a duplicate triple. auto Duplicate = FoundNormalizedTriples.find(NormalizedName); if (Duplicate != FoundNormalizedTriples.end()) { Diag(clang::diag::warn_drv_omp_offload_target_duplicate) << Val << Duplicate->second; continue; } // Store the current triple so that we can check for duplicates in the // following iterations. FoundNormalizedTriples[NormalizedName] = Val; // If the specified target is invalid, emit a diagnostic. if (TT.getArch() == llvm::Triple::UnknownArch) Diag(clang::diag::err_drv_invalid_omp_target) << Val; else { const ToolChain *TC; // Device toolchains have to be selected differently. They pair host // and device in their implementation. 
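// A standalone sketch of that pairing: device toolchains are cached under a
// key that combines the device and host triples, the same scheme as the
// ToolChains map used below. DeviceToolChain and getCachedDeviceTC are
// illustrative names, not driver API.
#include <map>
#include <memory>
#include <string>

struct DeviceToolChain {
  std::string DeviceTriple, HostTriple;
};

static DeviceToolChain &getCachedDeviceTC(
    std::map<std::string, std::unique_ptr<DeviceToolChain>> &Cache,
    const std::string &Device, const std::string &Host) {
  std::unique_ptr<DeviceToolChain> &Slot = Cache[Device + "/" + Host];
  if (!Slot) // created once, then reused for every action on this pair
    Slot = std::make_unique<DeviceToolChain>(DeviceToolChain{Device, Host});
  return *Slot;
}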
if (TT.isNVPTX() || TT.isAMDGCN()) { const ToolChain *HostTC = C.getSingleOffloadToolChain(); assert(HostTC && "Host toolchain should be always defined."); auto &DeviceTC = ToolChains[TT.str() + "/" + HostTC->getTriple().normalize()]; if (!DeviceTC) { if (TT.isNVPTX()) DeviceTC = std::make_unique( *this, TT, *HostTC, C.getInputArgs()); else if (TT.isAMDGCN()) DeviceTC = std::make_unique( *this, TT, *HostTC, C.getInputArgs()); else assert(DeviceTC && "Device toolchain not defined."); } TC = DeviceTC.get(); } else TC = &getToolChain(C.getInputArgs(), TT); C.addOffloadDeviceToolChain(TC, Action::OFK_OpenMP); if (DerivedArchs.contains(TT.getTriple())) KnownArchs[TC] = DerivedArchs[TT.getTriple()]; } } } else if (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ)) { Diag(clang::diag::err_drv_expecting_fopenmp_with_fopenmp_targets); return; } // // TODO: Add support for other offloading programming models here. // } static void appendOneArg(InputArgList &Args, const Arg *Opt, const Arg *BaseArg) { // The args for config files or /clang: flags belong to different InputArgList // objects than Args. This copies an Arg from one of those other InputArgLists // to the ownership of Args. unsigned Index = Args.MakeIndex(Opt->getSpelling()); Arg *Copy = new llvm::opt::Arg(Opt->getOption(), Args.getArgString(Index), Index, BaseArg); Copy->getValues() = Opt->getValues(); if (Opt->isClaimed()) Copy->claim(); Copy->setOwnsValues(Opt->getOwnsValues()); Opt->setOwnsValues(false); Args.append(Copy); } bool Driver::readConfigFile(StringRef FileName, llvm::cl::ExpansionContext &ExpCtx) { // Try opening the given file. auto Status = getVFS().status(FileName); if (!Status) { Diag(diag::err_drv_cannot_open_config_file) << FileName << Status.getError().message(); return true; } if (Status->getType() != llvm::sys::fs::file_type::regular_file) { Diag(diag::err_drv_cannot_open_config_file) << FileName << "not a regular file"; return true; } // Try reading the given file. SmallVector NewCfgArgs; if (llvm::Error Err = ExpCtx.readConfigFile(FileName, NewCfgArgs)) { Diag(diag::err_drv_cannot_read_config_file) << FileName << toString(std::move(Err)); return true; } // Read options from config file. llvm::SmallString<128> CfgFileName(FileName); llvm::sys::path::native(CfgFileName); bool ContainErrors; std::unique_ptr NewOptions = std::make_unique( ParseArgStrings(NewCfgArgs, IsCLMode(), ContainErrors)); if (ContainErrors) return true; // Claim all arguments that come from a configuration file so that the driver // does not warn on any that is unused. for (Arg *A : *NewOptions) A->claim(); if (!CfgOptions) CfgOptions = std::move(NewOptions); else { // If this is a subsequent config file, append options to the previous one. for (auto *Opt : *NewOptions) { const Arg *BaseArg = &Opt->getBaseArg(); if (BaseArg == Opt) BaseArg = nullptr; appendOneArg(*CfgOptions, Opt, BaseArg); } } ConfigFiles.push_back(std::string(CfgFileName)); return false; } bool Driver::loadConfigFiles() { llvm::cl::ExpansionContext ExpCtx(Saver.getAllocator(), llvm::cl::tokenizeConfigFile); ExpCtx.setVFS(&getVFS()); // Process options that change search path for config files. 
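// A standalone sketch of that directory handling, using std::filesystem as a
// stand-in for the VFS: a --config-*-dir= value is resolved to an absolute
// path and dropped entirely when resolution fails, much like the code below
// (which additionally expands '~' for the user directory).
#include <filesystem>
#include <string>
#include <system_error>

static std::string resolveConfigDir(const std::string &Arg) {
  if (Arg.empty())
    return {};
  std::error_code EC;
  std::filesystem::path Abs = std::filesystem::absolute(Arg, EC);
  return EC ? std::string() : Abs.string(); // empty means "no such dir"
}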
if (CLOptions) { if (CLOptions->hasArg(options::OPT_config_system_dir_EQ)) { SmallString<128> CfgDir; CfgDir.append( CLOptions->getLastArgValue(options::OPT_config_system_dir_EQ)); if (CfgDir.empty() || getVFS().makeAbsolute(CfgDir)) SystemConfigDir.clear(); else SystemConfigDir = static_cast(CfgDir); } if (CLOptions->hasArg(options::OPT_config_user_dir_EQ)) { SmallString<128> CfgDir; llvm::sys::fs::expand_tilde( CLOptions->getLastArgValue(options::OPT_config_user_dir_EQ), CfgDir); if (CfgDir.empty() || getVFS().makeAbsolute(CfgDir)) UserConfigDir.clear(); else UserConfigDir = static_cast(CfgDir); } } // Prepare list of directories where config file is searched for. StringRef CfgFileSearchDirs[] = {UserConfigDir, SystemConfigDir, Dir}; ExpCtx.setSearchDirs(CfgFileSearchDirs); // First try to load configuration from the default files, return on error. if (loadDefaultConfigFiles(ExpCtx)) return true; // Then load configuration files specified explicitly. SmallString<128> CfgFilePath; if (CLOptions) { for (auto CfgFileName : CLOptions->getAllArgValues(options::OPT_config)) { // If argument contains directory separator, treat it as a path to // configuration file. if (llvm::sys::path::has_parent_path(CfgFileName)) { CfgFilePath.assign(CfgFileName); if (llvm::sys::path::is_relative(CfgFilePath)) { if (getVFS().makeAbsolute(CfgFilePath)) { Diag(diag::err_drv_cannot_open_config_file) << CfgFilePath << "cannot get absolute path"; return true; } } } else if (!ExpCtx.findConfigFile(CfgFileName, CfgFilePath)) { // Report an error that the config file could not be found. Diag(diag::err_drv_config_file_not_found) << CfgFileName; for (const StringRef &SearchDir : CfgFileSearchDirs) if (!SearchDir.empty()) Diag(diag::note_drv_config_file_searched_in) << SearchDir; return true; } // Try to read the config file, return on error. if (readConfigFile(CfgFilePath, ExpCtx)) return true; } } // No error occurred. return false; } bool Driver::loadDefaultConfigFiles(llvm::cl::ExpansionContext &ExpCtx) { // Disable default config if CLANG_NO_DEFAULT_CONFIG is set to a non-empty // value. if (const char *NoConfigEnv = ::getenv("CLANG_NO_DEFAULT_CONFIG")) { if (*NoConfigEnv) return false; } if (CLOptions && CLOptions->hasArg(options::OPT_no_default_config)) return false; std::string RealMode = getExecutableForDriverMode(Mode); std::string Triple; // If name prefix is present, no --target= override was passed via CLOptions // and the name prefix is not a valid triple, force it for backwards // compatibility. if (!ClangNameParts.TargetPrefix.empty() && computeTargetTriple(*this, "/invalid/", *CLOptions).str() == "/invalid/") { llvm::Triple PrefixTriple{ClangNameParts.TargetPrefix}; if (PrefixTriple.getArch() == llvm::Triple::UnknownArch || PrefixTriple.isOSUnknown()) Triple = PrefixTriple.str(); } // Otherwise, use the real triple as used by the driver. if (Triple.empty()) { llvm::Triple RealTriple = computeTargetTriple(*this, TargetTriple, *CLOptions); Triple = RealTriple.str(); assert(!Triple.empty()); } // Search for config files in the following order: // 1. -.cfg using real driver mode // (e.g. i386-pc-linux-gnu-clang++.cfg). // 2. -.cfg using executable suffix // (e.g. i386-pc-linux-gnu-clang-g++.cfg for *clang-g++). // 3. .cfg + .cfg using real driver mode // (e.g. i386-pc-linux-gnu.cfg + clang++.cfg). // 4. .cfg + .cfg using executable suffix // (e.g. i386-pc-linux-gnu.cfg + clang-g++.cfg for *clang-g++). // Try loading -.cfg, and return if we find a match. 
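// A standalone sketch of that search order: the candidates are
// "<triple>-<mode>.cfg" first, then the executable-suffix variant, then the
// split "<mode>.cfg" + "<triple>.cfg" forms tried below. The function name
// and parameters are illustrative only.
#include <string>
#include <vector>

static std::vector<std::string>
defaultConfigCandidates(const std::string &Triple, const std::string &RealMode,
                        const std::string &ModeSuffix) {
  std::vector<std::string> Names = {Triple + "-" + RealMode + ".cfg"};
  if (!ModeSuffix.empty() && ModeSuffix != RealMode)
    Names.push_back(Triple + "-" + ModeSuffix + ".cfg"); // e.g. *clang-g++
  // The last two are loaded together when the combined form is not found.
  Names.push_back(RealMode + ".cfg");
  Names.push_back(Triple + ".cfg");
  return Names;
}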
SmallString<128> CfgFilePath; std::string CfgFileName = Triple + '-' + RealMode + ".cfg"; if (ExpCtx.findConfigFile(CfgFileName, CfgFilePath)) return readConfigFile(CfgFilePath, ExpCtx); bool TryModeSuffix = !ClangNameParts.ModeSuffix.empty() && ClangNameParts.ModeSuffix != RealMode; if (TryModeSuffix) { CfgFileName = Triple + '-' + ClangNameParts.ModeSuffix + ".cfg"; if (ExpCtx.findConfigFile(CfgFileName, CfgFilePath)) return readConfigFile(CfgFilePath, ExpCtx); } // Try loading .cfg, and return if loading failed. If a matching file // was not found, still proceed on to try .cfg. CfgFileName = RealMode + ".cfg"; if (ExpCtx.findConfigFile(CfgFileName, CfgFilePath)) { if (readConfigFile(CfgFilePath, ExpCtx)) return true; } else if (TryModeSuffix) { CfgFileName = ClangNameParts.ModeSuffix + ".cfg"; if (ExpCtx.findConfigFile(CfgFileName, CfgFilePath) && readConfigFile(CfgFilePath, ExpCtx)) return true; } // Try loading .cfg and return if we find a match. CfgFileName = Triple + ".cfg"; if (ExpCtx.findConfigFile(CfgFileName, CfgFilePath)) return readConfigFile(CfgFilePath, ExpCtx); // If we were unable to find a config file deduced from executable name, // that is not an error. return false; } Compilation *Driver::BuildCompilation(ArrayRef ArgList) { llvm::PrettyStackTraceString CrashInfo("Compilation construction"); // FIXME: Handle environment options which affect driver behavior, somewhere // (client?). GCC_EXEC_PREFIX, LPATH, CC_PRINT_OPTIONS. // We look for the driver mode option early, because the mode can affect // how other options are parsed. auto DriverMode = getDriverMode(ClangExecutable, ArgList.slice(1)); if (!DriverMode.empty()) setDriverMode(DriverMode); // FIXME: What are we going to do with -V and -b? // Arguments specified in command line. bool ContainsError; CLOptions = std::make_unique( ParseArgStrings(ArgList.slice(1), IsCLMode(), ContainsError)); // Try parsing configuration file. if (!ContainsError) ContainsError = loadConfigFiles(); bool HasConfigFile = !ContainsError && (CfgOptions.get() != nullptr); // All arguments, from both config file and command line. InputArgList Args = std::move(HasConfigFile ? std::move(*CfgOptions) : std::move(*CLOptions)); if (HasConfigFile) for (auto *Opt : *CLOptions) { if (Opt->getOption().matches(options::OPT_config)) continue; const Arg *BaseArg = &Opt->getBaseArg(); if (BaseArg == Opt) BaseArg = nullptr; appendOneArg(Args, Opt, BaseArg); } // In CL mode, look for any pass-through arguments if (IsCLMode() && !ContainsError) { SmallVector CLModePassThroughArgList; for (const auto *A : Args.filtered(options::OPT__SLASH_clang)) { A->claim(); CLModePassThroughArgList.push_back(A->getValue()); } if (!CLModePassThroughArgList.empty()) { // Parse any pass through args using default clang processing rather // than clang-cl processing. auto CLModePassThroughOptions = std::make_unique( ParseArgStrings(CLModePassThroughArgList, false, ContainsError)); if (!ContainsError) for (auto *Opt : *CLModePassThroughOptions) { appendOneArg(Args, Opt, nullptr); } } } // Check for working directory option before accessing any files if (Arg *WD = Args.getLastArg(options::OPT_working_directory)) if (VFS->setCurrentWorkingDirectory(WD->getValue())) Diag(diag::err_drv_unable_to_set_working_directory) << WD->getValue(); // FIXME: This stuff needs to go into the Compilation, not the driver. bool CCCPrintPhases; // -canonical-prefixes, -no-canonical-prefixes are used very early in main. 
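// A standalone sketch of why such flags are merely claimed here: some options
// (like --driver-mode= and the -canonical-prefixes family) are consumed from
// raw argv before the option table is ever consulted. findDriverMode is an
// illustrative name; the real scan lives in the driver's entry point.
#include <string>
#include <vector>

static std::string findDriverMode(const std::vector<std::string> &Argv) {
  const std::string Prefix = "--driver-mode=";
  std::string Mode;
  for (const std::string &A : Argv)
    if (A.compare(0, Prefix.size(), Prefix) == 0)
      Mode = A.substr(Prefix.size()); // last occurrence wins in this sketch
  return Mode;
}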
Args.ClaimAllArgs(options::OPT_canonical_prefixes); Args.ClaimAllArgs(options::OPT_no_canonical_prefixes); // f(no-)integated-cc1 is also used very early in main. Args.ClaimAllArgs(options::OPT_fintegrated_cc1); Args.ClaimAllArgs(options::OPT_fno_integrated_cc1); // Ignore -pipe. Args.ClaimAllArgs(options::OPT_pipe); // Extract -ccc args. // // FIXME: We need to figure out where this behavior should live. Most of it // should be outside in the client; the parts that aren't should have proper // options, either by introducing new ones or by overloading gcc ones like -V // or -b. CCCPrintPhases = Args.hasArg(options::OPT_ccc_print_phases); CCCPrintBindings = Args.hasArg(options::OPT_ccc_print_bindings); if (const Arg *A = Args.getLastArg(options::OPT_ccc_gcc_name)) CCCGenericGCCName = A->getValue(); // Process -fproc-stat-report options. if (const Arg *A = Args.getLastArg(options::OPT_fproc_stat_report_EQ)) { CCPrintProcessStats = true; CCPrintStatReportFilename = A->getValue(); } if (Args.hasArg(options::OPT_fproc_stat_report)) CCPrintProcessStats = true; // FIXME: TargetTriple is used by the target-prefixed calls to as/ld // and getToolChain is const. if (IsCLMode()) { // clang-cl targets MSVC-style Win32. llvm::Triple T(TargetTriple); T.setOS(llvm::Triple::Win32); T.setVendor(llvm::Triple::PC); T.setEnvironment(llvm::Triple::MSVC); T.setObjectFormat(llvm::Triple::COFF); if (Args.hasArg(options::OPT__SLASH_arm64EC)) T.setArch(llvm::Triple::aarch64, llvm::Triple::AArch64SubArch_arm64ec); TargetTriple = T.str(); } else if (IsDXCMode()) { // Build TargetTriple from target_profile option for clang-dxc. if (const Arg *A = Args.getLastArg(options::OPT_target_profile)) { StringRef TargetProfile = A->getValue(); if (auto Triple = toolchains::HLSLToolChain::parseTargetProfile(TargetProfile)) TargetTriple = *Triple; else Diag(diag::err_drv_invalid_directx_shader_module) << TargetProfile; A->claim(); } else { Diag(diag::err_drv_dxc_missing_target_profile); } } if (const Arg *A = Args.getLastArg(options::OPT_target)) TargetTriple = A->getValue(); if (const Arg *A = Args.getLastArg(options::OPT_ccc_install_dir)) Dir = InstalledDir = A->getValue(); for (const Arg *A : Args.filtered(options::OPT_B)) { A->claim(); PrefixDirs.push_back(A->getValue(0)); } if (std::optional CompilerPathValue = llvm::sys::Process::GetEnv("COMPILER_PATH")) { StringRef CompilerPath = *CompilerPathValue; while (!CompilerPath.empty()) { std::pair Split = CompilerPath.split(llvm::sys::EnvPathSeparator); PrefixDirs.push_back(std::string(Split.first)); CompilerPath = Split.second; } } if (const Arg *A = Args.getLastArg(options::OPT__sysroot_EQ)) SysRoot = A->getValue(); if (const Arg *A = Args.getLastArg(options::OPT__dyld_prefix_EQ)) DyldPrefix = A->getValue(); if (const Arg *A = Args.getLastArg(options::OPT_resource_dir)) ResourceDir = A->getValue(); if (const Arg *A = Args.getLastArg(options::OPT_save_temps_EQ)) { SaveTemps = llvm::StringSwitch(A->getValue()) .Case("cwd", SaveTempsCwd) .Case("obj", SaveTempsObj) .Default(SaveTempsCwd); } if (const Arg *A = Args.getLastArg(options::OPT_offload_host_only, options::OPT_offload_device_only, options::OPT_offload_host_device)) { if (A->getOption().matches(options::OPT_offload_host_only)) Offload = OffloadHost; else if (A->getOption().matches(options::OPT_offload_device_only)) Offload = OffloadDevice; else Offload = OffloadHostDevice; } setLTOMode(Args); // Process -fembed-bitcode= flags. 
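// A standalone sketch of the StringSwitch idiom used below for
// -fembed-bitcode=, with a sentinel standing in for the ~0U "unknown" value
// that triggers err_drv_invalid_value. EmbedKind is an illustrative enum,
// not the driver's BitcodeEmbedMode.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

enum EmbedKind { EmbedNothing, EmbedAll, EmbedMarkerOnly, EmbedInvalid };

static EmbedKind parseEmbedBitcode(llvm::StringRef Name) {
  return llvm::StringSwitch<EmbedKind>(Name)
      .Case("off", EmbedNothing)
      .Case("all", EmbedAll)
      .Case("bitcode", EmbedAll)
      .Case("marker", EmbedMarkerOnly)
      .Default(EmbedInvalid); // unknown values are diagnosed, not guessed
}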
if (Arg *A = Args.getLastArg(options::OPT_fembed_bitcode_EQ)) { StringRef Name = A->getValue(); unsigned Model = llvm::StringSwitch(Name) .Case("off", EmbedNone) .Case("all", EmbedBitcode) .Case("bitcode", EmbedBitcode) .Case("marker", EmbedMarker) .Default(~0U); if (Model == ~0U) { Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Name; } else BitcodeEmbed = static_cast(Model); } // Remove existing compilation database so that each job can append to it. if (Arg *A = Args.getLastArg(options::OPT_MJ)) llvm::sys::fs::remove(A->getValue()); // Setting up the jobs for some precompile cases depends on whether we are // treating them as PCH, implicit modules or C++20 ones. // TODO: inferring the mode like this seems fragile (it meets the objective // of not requiring anything new for operation, however). const Arg *Std = Args.getLastArg(options::OPT_std_EQ); ModulesModeCXX20 = !Args.hasArg(options::OPT_fmodules) && Std && (Std->containsValue("c++20") || Std->containsValue("c++2a") || Std->containsValue("c++23") || Std->containsValue("c++2b") || Std->containsValue("c++26") || Std->containsValue("c++2c") || Std->containsValue("c++latest")); // Process -fmodule-header{=} flags. if (Arg *A = Args.getLastArg(options::OPT_fmodule_header_EQ, options::OPT_fmodule_header)) { // These flags force C++20 handling of headers. ModulesModeCXX20 = true; if (A->getOption().matches(options::OPT_fmodule_header)) CXX20HeaderType = HeaderMode_Default; else { StringRef ArgName = A->getValue(); unsigned Kind = llvm::StringSwitch(ArgName) .Case("user", HeaderMode_User) .Case("system", HeaderMode_System) .Default(~0U); if (Kind == ~0U) { Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << ArgName; } else CXX20HeaderType = static_cast(Kind); } } std::unique_ptr UArgs = std::make_unique(std::move(Args)); // Perform the default argument translations. DerivedArgList *TranslatedArgs = TranslateInputArgs(*UArgs); // Owned by the host. const ToolChain &TC = getToolChain( *UArgs, computeTargetTriple(*this, TargetTriple, *UArgs)); // Report warning when arm64EC option is overridden by specified target if ((TC.getTriple().getArch() != llvm::Triple::aarch64 || TC.getTriple().getSubArch() != llvm::Triple::AArch64SubArch_arm64ec) && UArgs->hasArg(options::OPT__SLASH_arm64EC)) { getDiags().Report(clang::diag::warn_target_override_arm64ec) << TC.getTriple().str(); } // A common user mistake is specifying a target of aarch64-none-eabi or // arm-none-elf whereas the correct names are aarch64-none-elf & // arm-none-eabi. Detect these cases and issue a warning. if (TC.getTriple().getOS() == llvm::Triple::UnknownOS && TC.getTriple().getVendor() == llvm::Triple::UnknownVendor) { switch (TC.getTriple().getArch()) { case llvm::Triple::arm: case llvm::Triple::armeb: case llvm::Triple::thumb: case llvm::Triple::thumbeb: if (TC.getTriple().getEnvironmentName() == "elf") { Diag(diag::warn_target_unrecognized_env) << TargetTriple << (TC.getTriple().getArchName().str() + "-none-eabi"); } break; case llvm::Triple::aarch64: case llvm::Triple::aarch64_be: case llvm::Triple::aarch64_32: if (TC.getTriple().getEnvironmentName().startswith("eabi")) { Diag(diag::warn_target_unrecognized_env) << TargetTriple << (TC.getTriple().getArchName().str() + "-none-elf"); } break; default: break; } } // The compilation takes ownership of Args. Compilation *C = new Compilation(*this, TC, UArgs.release(), TranslatedArgs, ContainsError); if (!HandleImmediateArgs(*C)) return C; // Construct the list of inputs. 
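// A standalone sketch of the extension-based classification that BuildInputs
// (called just below, defined later in this file) applies, including the
// mode-dependent fallback: clang-cl /E falls back to C++, a bare preprocessor
// invocation to C, and everything else to a linker object. InputKind and the
// tiny table here are illustrative, not the driver's types:: machinery.
#include <map>
#include <string>

enum class InputKind { C, CXX, Object, Unknown };

static InputKind classifyByExtension(const std::string &Ext,
                                     bool IsCLPreprocess, bool IsCPP) {
  static const std::map<std::string, InputKind> Table = {
      {"c", InputKind::C},
      {"cpp", InputKind::CXX},
      {"cc", InputKind::CXX},
      {"o", InputKind::Object}};
  auto It = Table.find(Ext);
  if (It != Table.end())
    return It->second;
  if (IsCLPreprocess)
    return InputKind::CXX;
  return IsCPP ? InputKind::C : InputKind::Object;
}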
InputList Inputs; BuildInputs(C->getDefaultToolChain(), *TranslatedArgs, Inputs); // Populate the tool chains for the offloading devices, if any. CreateOffloadingDeviceToolChains(*C, Inputs); // Construct the list of abstract actions to perform for this compilation. On // MachO targets this uses the driver-driver and universal actions. if (TC.getTriple().isOSBinFormatMachO()) BuildUniversalActions(*C, C->getDefaultToolChain(), Inputs); else BuildActions(*C, C->getArgs(), Inputs, C->getActions()); if (CCCPrintPhases) { PrintActions(*C); return C; } BuildJobs(*C); return C; } static void printArgList(raw_ostream &OS, const llvm::opt::ArgList &Args) { llvm::opt::ArgStringList ASL; for (const auto *A : Args) { // Use user's original spelling of flags. For example, use // `/source-charset:utf-8` instead of `-finput-charset=utf-8` if the user // wrote the former. while (A->getAlias()) A = A->getAlias(); A->render(Args, ASL); } for (auto I = ASL.begin(), E = ASL.end(); I != E; ++I) { if (I != ASL.begin()) OS << ' '; llvm::sys::printArg(OS, *I, true); } OS << '\n'; } bool Driver::getCrashDiagnosticFile(StringRef ReproCrashFilename, SmallString<128> &CrashDiagDir) { using namespace llvm::sys; assert(llvm::Triple(llvm::sys::getProcessTriple()).isOSDarwin() && "Only knows about .crash files on Darwin"); // The .crash file can be found on at ~/Library/Logs/DiagnosticReports/ // (or /Library/Logs/DiagnosticReports for root) and has the filename pattern // clang-__.crash. path::home_directory(CrashDiagDir); if (CrashDiagDir.startswith("/var/root")) CrashDiagDir = "/"; path::append(CrashDiagDir, "Library/Logs/DiagnosticReports"); int PID = #if LLVM_ON_UNIX getpid(); #else 0; #endif std::error_code EC; fs::file_status FileStatus; TimePoint<> LastAccessTime; SmallString<128> CrashFilePath; // Lookup the .crash files and get the one generated by a subprocess spawned // by this driver invocation. for (fs::directory_iterator File(CrashDiagDir, EC), FileEnd; File != FileEnd && !EC; File.increment(EC)) { StringRef FileName = path::filename(File->path()); if (!FileName.startswith(Name)) continue; if (fs::status(File->path(), FileStatus)) continue; llvm::ErrorOr> CrashFile = llvm::MemoryBuffer::getFile(File->path()); if (!CrashFile) continue; // The first line should start with "Process:", otherwise this isn't a real // .crash file. StringRef Data = CrashFile.get()->getBuffer(); if (!Data.startswith("Process:")) continue; // Parse parent process pid line, e.g: "Parent Process: clang-4.0 [79141]" size_t ParentProcPos = Data.find("Parent Process:"); if (ParentProcPos == StringRef::npos) continue; size_t LineEnd = Data.find_first_of("\n", ParentProcPos); if (LineEnd == StringRef::npos) continue; StringRef ParentProcess = Data.slice(ParentProcPos+15, LineEnd).trim(); int OpenBracket = -1, CloseBracket = -1; for (size_t i = 0, e = ParentProcess.size(); i < e; ++i) { if (ParentProcess[i] == '[') OpenBracket = i; if (ParentProcess[i] == ']') CloseBracket = i; } // Extract the parent process PID from the .crash file and check whether // it matches this driver invocation pid. int CrashPID; if (OpenBracket < 0 || CloseBracket < 0 || ParentProcess.slice(OpenBracket + 1, CloseBracket) .getAsInteger(10, CrashPID) || CrashPID != PID) { continue; } // Found a .crash file matching the driver pid. To avoid getting an older // and misleading crash file, continue looking for the most recent. // FIXME: the driver can dispatch multiple cc1 invocations, leading to // multiple crashes poiting to the same parent process. 
Since the driver // does not collect pid information for the dispatched invocation there's // currently no way to distinguish among them. const auto FileAccessTime = FileStatus.getLastModificationTime(); if (FileAccessTime > LastAccessTime) { CrashFilePath.assign(File->path()); LastAccessTime = FileAccessTime; } } // If found, copy it over to the location of other reproducer files. if (!CrashFilePath.empty()) { EC = fs::copy_file(CrashFilePath, ReproCrashFilename); if (EC) return false; return true; } return false; } static const char BugReporMsg[] = "\n********************\n\n" "PLEASE ATTACH THE FOLLOWING FILES TO THE BUG REPORT:\n" "Preprocessed source(s) and associated run script(s) are located at:"; // When clang crashes, produce diagnostic information including the fully // preprocessed source file(s). Request that the developer attach the // diagnostic information to a bug report. void Driver::generateCompilationDiagnostics( Compilation &C, const Command &FailingCommand, StringRef AdditionalInformation, CompilationDiagnosticReport *Report) { if (C.getArgs().hasArg(options::OPT_fno_crash_diagnostics)) return; unsigned Level = 1; if (Arg *A = C.getArgs().getLastArg(options::OPT_fcrash_diagnostics_EQ)) { Level = llvm::StringSwitch(A->getValue()) .Case("off", 0) .Case("compiler", 1) .Case("all", 2) .Default(1); } if (!Level) return; // Don't try to generate diagnostics for dsymutil jobs. if (FailingCommand.getCreator().isDsymutilJob()) return; bool IsLLD = false; ArgStringList SavedTemps; if (FailingCommand.getCreator().isLinkJob()) { C.getDefaultToolChain().GetLinkerPath(&IsLLD); if (!IsLLD || Level < 2) return; // If lld crashed, we will re-run the same command with the input it used // to have. In that case we should not remove temp files in // initCompilationForDiagnostics yet. They will be added back and removed // later. SavedTemps = std::move(C.getTempFiles()); assert(!C.getTempFiles().size()); } // Print the version of the compiler. PrintVersion(C, llvm::errs()); // Suppress driver output and emit preprocessor output to temp file. CCGenDiagnostics = true; // Save the original job command(s). Command Cmd = FailingCommand; // Keep track of whether we produce any errors while trying to produce // preprocessed sources. DiagnosticErrorTrap Trap(Diags); // Suppress tool output. C.initCompilationForDiagnostics(); // If lld failed, rerun it again with --reproduce. if (IsLLD) { const char *TmpName = CreateTempFile(C, "linker-crash", "tar"); Command NewLLDInvocation = Cmd; llvm::opt::ArgStringList ArgList = NewLLDInvocation.getArguments(); StringRef ReproduceOption = C.getDefaultToolChain().getTriple().isWindowsMSVCEnvironment() ? "/reproduce:" : "--reproduce="; ArgList.push_back(Saver.save(Twine(ReproduceOption) + TmpName).data()); NewLLDInvocation.replaceArguments(std::move(ArgList)); // Redirect stdout/stderr to /dev/null. NewLLDInvocation.Execute({std::nullopt, {""}, {""}}, nullptr, nullptr); Diag(clang::diag::note_drv_command_failed_diag_msg) << BugReporMsg; Diag(clang::diag::note_drv_command_failed_diag_msg) << TmpName; Diag(clang::diag::note_drv_command_failed_diag_msg) << "\n\n********************"; if (Report) Report->TemporaryFiles.push_back(TmpName); return; } // Construct the list of inputs. InputList Inputs; BuildInputs(C.getDefaultToolChain(), C.getArgs(), Inputs); for (InputList::iterator it = Inputs.begin(), ie = Inputs.end(); it != ie;) { bool IgnoreInput = false; // Ignore input from stdin or any inputs that cannot be preprocessed. 
// Check type first as not all linker inputs have a value. if (types::getPreprocessedType(it->first) == types::TY_INVALID) { IgnoreInput = true; } else if (!strcmp(it->second->getValue(), "-")) { Diag(clang::diag::note_drv_command_failed_diag_msg) << "Error generating preprocessed source(s) - " "ignoring input from stdin."; IgnoreInput = true; } if (IgnoreInput) { it = Inputs.erase(it); ie = Inputs.end(); } else { ++it; } } if (Inputs.empty()) { Diag(clang::diag::note_drv_command_failed_diag_msg) << "Error generating preprocessed source(s) - " "no preprocessable inputs."; return; } // Don't attempt to generate preprocessed files if multiple -arch options are // used, unless they're all duplicates. llvm::StringSet<> ArchNames; for (const Arg *A : C.getArgs()) { if (A->getOption().matches(options::OPT_arch)) { StringRef ArchName = A->getValue(); ArchNames.insert(ArchName); } } if (ArchNames.size() > 1) { Diag(clang::diag::note_drv_command_failed_diag_msg) << "Error generating preprocessed source(s) - cannot generate " "preprocessed source with multiple -arch options."; return; } // Construct the list of abstract actions to perform for this compilation. On // Darwin OSes this uses the driver-driver and builds universal actions. const ToolChain &TC = C.getDefaultToolChain(); if (TC.getTriple().isOSBinFormatMachO()) BuildUniversalActions(C, TC, Inputs); else BuildActions(C, C.getArgs(), Inputs, C.getActions()); BuildJobs(C); // If there were errors building the compilation, quit now. if (Trap.hasErrorOccurred()) { Diag(clang::diag::note_drv_command_failed_diag_msg) << "Error generating preprocessed source(s)."; return; } // Generate preprocessed output. SmallVector, 4> FailingCommands; C.ExecuteJobs(C.getJobs(), FailingCommands); // If any of the preprocessing commands failed, clean up and exit. if (!FailingCommands.empty()) { Diag(clang::diag::note_drv_command_failed_diag_msg) << "Error generating preprocessed source(s)."; return; } const ArgStringList &TempFiles = C.getTempFiles(); if (TempFiles.empty()) { Diag(clang::diag::note_drv_command_failed_diag_msg) << "Error generating preprocessed source(s)."; return; } Diag(clang::diag::note_drv_command_failed_diag_msg) << BugReporMsg; SmallString<128> VFS; SmallString<128> ReproCrashFilename; for (const char *TempFile : TempFiles) { Diag(clang::diag::note_drv_command_failed_diag_msg) << TempFile; if (Report) Report->TemporaryFiles.push_back(TempFile); if (ReproCrashFilename.empty()) { ReproCrashFilename = TempFile; llvm::sys::path::replace_extension(ReproCrashFilename, ".crash"); } if (StringRef(TempFile).endswith(".cache")) { // In some cases (modules) we'll dump extra data to help with reproducing // the crash into a directory next to the output. VFS = llvm::sys::path::filename(TempFile); llvm::sys::path::append(VFS, "vfs", "vfs.yaml"); } } for (const char *TempFile : SavedTemps) C.addTempFile(TempFile); // Assume associated files are based off of the first temporary file. 
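// A standalone sketch of that derivation, using std::filesystem instead of
// llvm::sys::path: the reproducer script name comes from the first
// preprocessed temp file with its extension swapped, as the code below does.
#include <filesystem>
#include <string>

static std::string reproducerScriptFor(const std::string &FirstTempFile) {
  std::filesystem::path P(FirstTempFile);
  P.replace_extension(".sh"); // e.g. foo-abc123.cpp -> foo-abc123.sh
  return P.string();
}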
CrashReportInfo CrashInfo(TempFiles[0], VFS); llvm::SmallString<128> Script(CrashInfo.Filename); llvm::sys::path::replace_extension(Script, "sh"); std::error_code EC; llvm::raw_fd_ostream ScriptOS(Script, EC, llvm::sys::fs::CD_CreateNew, llvm::sys::fs::FA_Write, llvm::sys::fs::OF_Text); if (EC) { Diag(clang::diag::note_drv_command_failed_diag_msg) << "Error generating run script: " << Script << " " << EC.message(); } else { ScriptOS << "# Crash reproducer for " << getClangFullVersion() << "\n" << "# Driver args: "; printArgList(ScriptOS, C.getInputArgs()); ScriptOS << "# Original command: "; Cmd.Print(ScriptOS, "\n", /*Quote=*/true); Cmd.Print(ScriptOS, "\n", /*Quote=*/true, &CrashInfo); if (!AdditionalInformation.empty()) ScriptOS << "\n# Additional information: " << AdditionalInformation << "\n"; if (Report) Report->TemporaryFiles.push_back(std::string(Script.str())); Diag(clang::diag::note_drv_command_failed_diag_msg) << Script; } // On darwin, provide information about the .crash diagnostic report. if (llvm::Triple(llvm::sys::getProcessTriple()).isOSDarwin()) { SmallString<128> CrashDiagDir; if (getCrashDiagnosticFile(ReproCrashFilename, CrashDiagDir)) { Diag(clang::diag::note_drv_command_failed_diag_msg) << ReproCrashFilename.str(); } else { // Suggest a directory for the user to look for .crash files. llvm::sys::path::append(CrashDiagDir, Name); CrashDiagDir += "__.crash"; Diag(clang::diag::note_drv_command_failed_diag_msg) << "Crash backtrace is located in"; Diag(clang::diag::note_drv_command_failed_diag_msg) << CrashDiagDir.str(); Diag(clang::diag::note_drv_command_failed_diag_msg) << "(choose the .crash file that corresponds to your crash)"; } } Diag(clang::diag::note_drv_command_failed_diag_msg) << "\n\n********************"; } void Driver::setUpResponseFiles(Compilation &C, Command &Cmd) { // Since commandLineFitsWithinSystemLimits() may underestimate system's // capacity if the tool does not support response files, there is a chance/ // that things will just work without a response file, so we silently just // skip it. if (Cmd.getResponseFileSupport().ResponseKind == ResponseFileSupport::RF_None || llvm::sys::commandLineFitsWithinSystemLimits(Cmd.getExecutable(), Cmd.getArguments())) return; std::string TmpName = GetTemporaryPath("response", "txt"); Cmd.setResponseFile(C.addTempFile(C.getArgs().MakeArgString(TmpName))); } int Driver::ExecuteCompilation( Compilation &C, SmallVectorImpl> &FailingCommands) { if (C.getArgs().hasArg(options::OPT_fdriver_only)) { if (C.getArgs().hasArg(options::OPT_v)) C.getJobs().Print(llvm::errs(), "\n", true); C.ExecuteJobs(C.getJobs(), FailingCommands, /*LogOnly=*/true); // If there were errors building the compilation, quit now. if (!FailingCommands.empty() || Diags.hasErrorOccurred()) return 1; return 0; } // Just print if -### was present. if (C.getArgs().hasArg(options::OPT__HASH_HASH_HASH)) { C.getJobs().Print(llvm::errs(), "\n", true); return 0; } // If there were errors building the compilation, quit now. if (Diags.hasErrorOccurred()) return 1; // Set up response file names for each command, if necessary. for (auto &Job : C.getJobs()) setUpResponseFiles(C, Job); C.ExecuteJobs(C.getJobs(), FailingCommands); // If the command succeeded, we are done. if (FailingCommands.empty()) return 0; // Otherwise, remove result files and print extra information about abnormal // failures. 
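// A standalone sketch of how a failing command's result code is interpreted
// in the loop below: a negative result means the process died on a signal,
// and a tool with good diagnostics that exits with 1 is assumed to have
// printed its own error already. describeCommandResult is illustrative only.
#include <string>

static std::string describeCommandResult(int Res, bool HasGoodDiagnostics) {
  if (Res < 0)
    return "command crashed or was killed by a signal";
  if (Res == 1 && HasGoodDiagnostics)
    return "command failed and reported its own error";
  return "command failed with exit code " + std::to_string(Res);
}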
int Res = 0; for (const auto &CmdPair : FailingCommands) { int CommandRes = CmdPair.first; const Command *FailingCommand = CmdPair.second; // Remove result files if we're not saving temps. if (!isSaveTempsEnabled()) { const JobAction *JA = cast(&FailingCommand->getSource()); C.CleanupFileMap(C.getResultFiles(), JA, true); // Failure result files are valid unless we crashed. if (CommandRes < 0) C.CleanupFileMap(C.getFailureResultFiles(), JA, true); } // llvm/lib/Support/*/Signals.inc will exit with a special return code // for SIGPIPE. Do not print diagnostics for this case. if (CommandRes == EX_IOERR) { Res = CommandRes; continue; } // Print extra information about abnormal failures, if possible. // // This is ad-hoc, but we don't want to be excessively noisy. If the result // status was 1, assume the command failed normally. In particular, if it // was the compiler then assume it gave a reasonable error code. Failures // in other tools are less common, and they generally have worse // diagnostics, so always print the diagnostic there. const Tool &FailingTool = FailingCommand->getCreator(); if (!FailingCommand->getCreator().hasGoodDiagnostics() || CommandRes != 1) { // FIXME: See FIXME above regarding result code interpretation. if (CommandRes < 0) Diag(clang::diag::err_drv_command_signalled) << FailingTool.getShortName(); else Diag(clang::diag::err_drv_command_failed) << FailingTool.getShortName() << CommandRes; } } return Res; } void Driver::PrintHelp(bool ShowHidden) const { unsigned IncludedFlagsBitmask; unsigned ExcludedFlagsBitmask; std::tie(IncludedFlagsBitmask, ExcludedFlagsBitmask) = getIncludeExcludeOptionFlagMasks(IsCLMode()); ExcludedFlagsBitmask |= options::NoDriverOption; if (!ShowHidden) ExcludedFlagsBitmask |= HelpHidden; if (IsFlangMode()) IncludedFlagsBitmask |= options::FlangOption; else ExcludedFlagsBitmask |= options::FlangOnlyOption; std::string Usage = llvm::formatv("{0} [options] file...", Name).str(); getOpts().printHelp(llvm::outs(), Usage.c_str(), DriverTitle.c_str(), IncludedFlagsBitmask, ExcludedFlagsBitmask, /*ShowAllAliases=*/false); } void Driver::PrintVersion(const Compilation &C, raw_ostream &OS) const { if (IsFlangMode()) { OS << getClangToolFullVersion("flang-new") << '\n'; } else { // FIXME: The following handlers should use a callback mechanism, we don't // know what the client would like to do. OS << getClangFullVersion() << '\n'; } const ToolChain &TC = C.getDefaultToolChain(); OS << "Target: " << TC.getTripleString() << '\n'; // Print the threading model. if (Arg *A = C.getArgs().getLastArg(options::OPT_mthread_model)) { // Don't print if the ToolChain would have barfed on it already if (TC.isThreadModelSupported(A->getValue())) OS << "Thread model: " << A->getValue(); } else OS << "Thread model: " << TC.getThreadModel(); OS << '\n'; // Print out the install directory. OS << "InstalledDir: " << InstalledDir << '\n'; // If configuration files were used, print their paths. for (auto ConfigFile : ConfigFiles) OS << "Configuration file: " << ConfigFile << '\n'; } /// PrintDiagnosticCategories - Implement the --print-diagnostic-categories /// option. static void PrintDiagnosticCategories(raw_ostream &OS) { // Skip the empty category. for (unsigned i = 1, max = DiagnosticIDs::getNumberOfCategories(); i != max; ++i) OS << i << ',' << DiagnosticIDs::getCategoryNameFromID(i) << '\n'; } void Driver::HandleAutocompletions(StringRef PassedFlags) const { if (PassedFlags == "") return; // Print out all options that start with a given argument. 
This is used for // shell autocompletion. std::vector SuggestedCompletions; std::vector Flags; unsigned int DisableFlags = options::NoDriverOption | options::Unsupported | options::Ignored; // Make sure that Flang-only options don't pollute the Clang output // TODO: Make sure that Clang-only options don't pollute Flang output if (!IsFlangMode()) DisableFlags |= options::FlangOnlyOption; // Distinguish "--autocomplete=-someflag" and "--autocomplete=-someflag," // because the latter indicates that the user put space before pushing tab // which should end up in a file completion. const bool HasSpace = PassedFlags.endswith(","); // Parse PassedFlags by "," as all the command-line flags are passed to this // function separated by "," StringRef TargetFlags = PassedFlags; while (TargetFlags != "") { StringRef CurFlag; std::tie(CurFlag, TargetFlags) = TargetFlags.split(","); Flags.push_back(std::string(CurFlag)); } // We want to show cc1-only options only when clang is invoked with -cc1 or // -Xclang. if (llvm::is_contained(Flags, "-Xclang") || llvm::is_contained(Flags, "-cc1")) DisableFlags &= ~options::NoDriverOption; const llvm::opt::OptTable &Opts = getOpts(); StringRef Cur; Cur = Flags.at(Flags.size() - 1); StringRef Prev; if (Flags.size() >= 2) { Prev = Flags.at(Flags.size() - 2); SuggestedCompletions = Opts.suggestValueCompletions(Prev, Cur); } if (SuggestedCompletions.empty()) SuggestedCompletions = Opts.suggestValueCompletions(Cur, ""); // If Flags were empty, it means the user typed `clang [tab]` where we should // list all possible flags. If there was no value completion and the user // pressed tab after a space, we should fall back to a file completion. // We're printing a newline to be consistent with what we print at the end of // this function. if (SuggestedCompletions.empty() && HasSpace && !Flags.empty()) { llvm::outs() << '\n'; return; } // When flag ends with '=' and there was no value completion, return empty // string and fall back to the file autocompletion. if (SuggestedCompletions.empty() && !Cur.endswith("=")) { // If the flag is in the form of "--autocomplete=-foo", // we were requested to print out all option names that start with "-foo". // For example, "--autocomplete=-fsyn" is expanded to "-fsyntax-only". SuggestedCompletions = Opts.findByPrefix(Cur, DisableFlags); // We have to query the -W flags manually as they're not in the OptTable. // TODO: Find a good way to add them to OptTable instead and them remove // this code. for (StringRef S : DiagnosticIDs::getDiagnosticFlags()) if (S.startswith(Cur)) SuggestedCompletions.push_back(std::string(S)); } // Sort the autocomplete candidates so that shells print them out in a // deterministic order. We could sort in any way, but we chose // case-insensitive sorting for consistency with the -help option // which prints out options in the case-insensitive alphabetical order. llvm::sort(SuggestedCompletions, [](StringRef A, StringRef B) { if (int X = A.compare_insensitive(B)) return X < 0; return A.compare(B) > 0; }); llvm::outs() << llvm::join(SuggestedCompletions, "\n") << '\n'; } bool Driver::HandleImmediateArgs(const Compilation &C) { // The order these options are handled in gcc is all over the place, but we // don't expect inconsistencies w.r.t. that to matter in practice. 
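// A standalone sketch of the "immediate argument" pattern the function below
// follows: a handful of flags simply print something and stop the normal
// pipeline. The flag spellings are real, but the return convention and
// handleImmediateFlag itself are illustrative, not the driver's API.
#include <iostream>
#include <string>

static bool handleImmediateFlag(const std::string &Flag,
                                const std::string &DefaultTriple) {
  if (Flag == "-dumpmachine") {
    std::cout << DefaultTriple << '\n'; // like OPT_dumpmachine below
    return true;
  }
  if (Flag == "-print-target-triple") {
    std::cout << DefaultTriple << '\n'; // like OPT_print_target_triple below
    return true;
  }
  return false; // not an immediate flag; keep building the compilation
}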
if (C.getArgs().hasArg(options::OPT_dumpmachine)) { llvm::outs() << C.getDefaultToolChain().getTripleString() << '\n'; return false; } if (C.getArgs().hasArg(options::OPT_dumpversion)) { // Since -dumpversion is only implemented for pedantic GCC compatibility, we // return an answer which matches our definition of __VERSION__. llvm::outs() << CLANG_VERSION_STRING << "\n"; return false; } if (C.getArgs().hasArg(options::OPT__print_diagnostic_categories)) { PrintDiagnosticCategories(llvm::outs()); return false; } if (C.getArgs().hasArg(options::OPT_help) || C.getArgs().hasArg(options::OPT__help_hidden)) { PrintHelp(C.getArgs().hasArg(options::OPT__help_hidden)); return false; } if (C.getArgs().hasArg(options::OPT__version)) { // Follow gcc behavior and use stdout for --version and stderr for -v. PrintVersion(C, llvm::outs()); return false; } if (C.getArgs().hasArg(options::OPT_v) || C.getArgs().hasArg(options::OPT__HASH_HASH_HASH) || C.getArgs().hasArg(options::OPT_print_supported_cpus)) { PrintVersion(C, llvm::errs()); SuppressMissingInputWarning = true; } if (C.getArgs().hasArg(options::OPT_v)) { if (!SystemConfigDir.empty()) llvm::errs() << "System configuration file directory: " << SystemConfigDir << "\n"; if (!UserConfigDir.empty()) llvm::errs() << "User configuration file directory: " << UserConfigDir << "\n"; } const ToolChain &TC = C.getDefaultToolChain(); if (C.getArgs().hasArg(options::OPT_v)) TC.printVerboseInfo(llvm::errs()); if (C.getArgs().hasArg(options::OPT_print_resource_dir)) { llvm::outs() << ResourceDir << '\n'; return false; } if (C.getArgs().hasArg(options::OPT_print_search_dirs)) { llvm::outs() << "programs: ="; bool separator = false; // Print -B and COMPILER_PATH. for (const std::string &Path : PrefixDirs) { if (separator) llvm::outs() << llvm::sys::EnvPathSeparator; llvm::outs() << Path; separator = true; } for (const std::string &Path : TC.getProgramPaths()) { if (separator) llvm::outs() << llvm::sys::EnvPathSeparator; llvm::outs() << Path; separator = true; } llvm::outs() << "\n"; llvm::outs() << "libraries: =" << ResourceDir; StringRef sysroot = C.getSysRoot(); for (const std::string &Path : TC.getFilePaths()) { // Always print a separator. ResourceDir was the first item shown. llvm::outs() << llvm::sys::EnvPathSeparator; // Interpretation of leading '=' is needed only for NetBSD. if (Path[0] == '=') llvm::outs() << sysroot << Path.substr(1); else llvm::outs() << Path; } llvm::outs() << "\n"; return false; } if (C.getArgs().hasArg(options::OPT_print_runtime_dir)) { std::string RuntimePath; // Get the first existing path, if any. for (auto Path : TC.getRuntimePaths()) { if (getVFS().exists(Path)) { RuntimePath = Path; break; } } if (!RuntimePath.empty()) llvm::outs() << RuntimePath << '\n'; else llvm::outs() << TC.getCompilerRTPath() << '\n'; return false; } if (C.getArgs().hasArg(options::OPT_print_diagnostic_options)) { std::vector Flags = DiagnosticIDs::getDiagnosticFlags(); for (std::size_t I = 0; I != Flags.size(); I += 2) llvm::outs() << " " << Flags[I] << "\n " << Flags[I + 1] << "\n\n"; return false; } // FIXME: The following handlers should use a callback mechanism, we don't // know what the client would like to do. if (Arg *A = C.getArgs().getLastArg(options::OPT_print_file_name_EQ)) { llvm::outs() << GetFilePath(A->getValue(), TC) << "\n"; return false; } if (Arg *A = C.getArgs().getLastArg(options::OPT_print_prog_name_EQ)) { StringRef ProgName = A->getValue(); // Null program name cannot have a path. if (! 
ProgName.empty()) llvm::outs() << GetProgramPath(ProgName, TC); llvm::outs() << "\n"; return false; } if (Arg *A = C.getArgs().getLastArg(options::OPT_autocomplete)) { StringRef PassedFlags = A->getValue(); HandleAutocompletions(PassedFlags); return false; } if (C.getArgs().hasArg(options::OPT_print_libgcc_file_name)) { ToolChain::RuntimeLibType RLT = TC.GetRuntimeLibType(C.getArgs()); const llvm::Triple Triple(TC.ComputeEffectiveClangTriple(C.getArgs())); RegisterEffectiveTriple TripleRAII(TC, Triple); switch (RLT) { case ToolChain::RLT_CompilerRT: llvm::outs() << TC.getCompilerRT(C.getArgs(), "builtins") << "\n"; break; case ToolChain::RLT_Libgcc: llvm::outs() << GetFilePath("libgcc.a", TC) << "\n"; break; } return false; } if (C.getArgs().hasArg(options::OPT_print_multi_lib)) { for (const Multilib &Multilib : TC.getMultilibs()) llvm::outs() << Multilib << "\n"; return false; } if (C.getArgs().hasArg(options::OPT_print_multi_flags)) { Multilib::flags_list ArgFlags = TC.getMultilibFlags(C.getArgs()); llvm::StringSet<> ExpandedFlags = TC.getMultilibs().expandFlags(ArgFlags); std::set SortedFlags; for (const auto &FlagEntry : ExpandedFlags) SortedFlags.insert(FlagEntry.getKey()); for (auto Flag : SortedFlags) llvm::outs() << Flag << '\n'; return false; } if (C.getArgs().hasArg(options::OPT_print_multi_directory)) { for (const Multilib &Multilib : TC.getSelectedMultilibs()) { if (Multilib.gccSuffix().empty()) llvm::outs() << ".\n"; else { StringRef Suffix(Multilib.gccSuffix()); assert(Suffix.front() == '/'); llvm::outs() << Suffix.substr(1) << "\n"; } } return false; } if (C.getArgs().hasArg(options::OPT_print_target_triple)) { llvm::outs() << TC.getTripleString() << "\n"; return false; } if (C.getArgs().hasArg(options::OPT_print_effective_triple)) { const llvm::Triple Triple(TC.ComputeEffectiveClangTriple(C.getArgs())); llvm::outs() << Triple.getTriple() << "\n"; return false; } if (C.getArgs().hasArg(options::OPT_print_targets)) { llvm::TargetRegistry::printRegisteredTargetsForVersion(llvm::outs()); return false; } return true; } enum { TopLevelAction = 0, HeadSibAction = 1, OtherSibAction = 2, }; // Display an action graph human-readably. Action A is the "sink" node // and latest-occuring action. Traversal is in pre-order, visiting the // inputs to each action before printing the action itself. static unsigned PrintActions1(const Compilation &C, Action *A, std::map &Ids, Twine Indent = {}, int Kind = TopLevelAction) { if (Ids.count(A)) // A was already visited. return Ids[A]; std::string str; llvm::raw_string_ostream os(str); auto getSibIndent = [](int K) -> Twine { return (K == HeadSibAction) ? " " : (K == OtherSibAction) ? "| " : ""; }; Twine SibIndent = Indent + getSibIndent(Kind); int SibKind = HeadSibAction; os << Action::getClassName(A->getKind()) << ", "; if (InputAction *IA = dyn_cast(A)) { os << "\"" << IA->getInputArg().getValue() << "\""; } else if (BindArchAction *BIA = dyn_cast(A)) { os << '"' << BIA->getArchName() << '"' << ", {" << PrintActions1(C, *BIA->input_begin(), Ids, SibIndent, SibKind) << "}"; } else if (OffloadAction *OA = dyn_cast(A)) { bool IsFirst = true; OA->doOnEachDependence( [&](Action *A, const ToolChain *TC, const char *BoundArch) { assert(TC && "Unknown host toolchain"); // E.g. 
for two CUDA device dependences whose bound arch is sm_20 and // sm_35 this will generate: // "cuda-device" (nvptx64-nvidia-cuda:sm_20) {#ID}, "cuda-device" // (nvptx64-nvidia-cuda:sm_35) {#ID} if (!IsFirst) os << ", "; os << '"'; os << A->getOffloadingKindPrefix(); os << " ("; os << TC->getTriple().normalize(); if (BoundArch) os << ":" << BoundArch; os << ")"; os << '"'; os << " {" << PrintActions1(C, A, Ids, SibIndent, SibKind) << "}"; IsFirst = false; SibKind = OtherSibAction; }); } else { const ActionList *AL = &A->getInputs(); if (AL->size()) { const char *Prefix = "{"; for (Action *PreRequisite : *AL) { os << Prefix << PrintActions1(C, PreRequisite, Ids, SibIndent, SibKind); Prefix = ", "; SibKind = OtherSibAction; } os << "}"; } else os << "{}"; } // Append offload info for all options other than the offloading action // itself (e.g. (cuda-device, sm_20) or (cuda-host)). std::string offload_str; llvm::raw_string_ostream offload_os(offload_str); if (!isa(A)) { auto S = A->getOffloadingKindPrefix(); if (!S.empty()) { offload_os << ", (" << S; if (A->getOffloadingArch()) offload_os << ", " << A->getOffloadingArch(); offload_os << ")"; } } auto getSelfIndent = [](int K) -> Twine { return (K == HeadSibAction) ? "+- " : (K == OtherSibAction) ? "|- " : ""; }; unsigned Id = Ids.size(); Ids[A] = Id; llvm::errs() << Indent + getSelfIndent(Kind) << Id << ": " << os.str() << ", " << types::getTypeName(A->getType()) << offload_os.str() << "\n"; return Id; } // Print the action graphs in a compilation C. // For example "clang -c file1.c file2.c" is composed of two subgraphs. void Driver::PrintActions(const Compilation &C) const { std::map Ids; for (Action *A : C.getActions()) PrintActions1(C, A, Ids); } /// Check whether the given input tree contains any compilation or /// assembly actions. static bool ContainsCompileOrAssembleAction(const Action *A) { if (isa(A) || isa(A) || isa(A)) return true; return llvm::any_of(A->inputs(), ContainsCompileOrAssembleAction); } void Driver::BuildUniversalActions(Compilation &C, const ToolChain &TC, const InputList &BAInputs) const { DerivedArgList &Args = C.getArgs(); ActionList &Actions = C.getActions(); llvm::PrettyStackTraceString CrashInfo("Building universal build actions"); // Collect the list of architectures. Duplicates are allowed, but should only // be handled once (in the order seen). llvm::StringSet<> ArchNames; SmallVector Archs; for (Arg *A : Args) { if (A->getOption().matches(options::OPT_arch)) { // Validate the option here; we don't save the type here because its // particular spelling may participate in other driver choices. llvm::Triple::ArchType Arch = tools::darwin::getArchTypeForMachOArchName(A->getValue()); if (Arch == llvm::Triple::UnknownArch) { Diag(clang::diag::err_drv_invalid_arch_name) << A->getAsString(Args); continue; } A->claim(); if (ArchNames.insert(A->getValue()).second) Archs.push_back(A->getValue()); } } // When there is no explicit arch for this platform, make sure we still bind // the architecture (to the default) so that -Xarch_ is handled correctly. if (!Archs.size()) Archs.push_back(Args.MakeArgString(TC.getDefaultUniversalArchName())); ActionList SingleActions; BuildActions(C, Args, BAInputs, SingleActions); // Add in arch bindings for every top level action, as well as lipo and // dsymutil steps if needed. for (Action* Act : SingleActions) { // Make sure we can lipo this kind of output. 
If not (and it is an actual // output) then we disallow, since we can't create an output file with the // right name without overwriting it. We could remove this oddity by just // changing the output names to include the arch, which would also fix // -save-temps. Compatibility wins for now. if (Archs.size() > 1 && !types::canLipoType(Act->getType())) Diag(clang::diag::err_drv_invalid_output_with_multiple_archs) << types::getTypeName(Act->getType()); ActionList Inputs; for (unsigned i = 0, e = Archs.size(); i != e; ++i) Inputs.push_back(C.MakeAction(Act, Archs[i])); // Lipo if necessary, we do it this way because we need to set the arch flag // so that -Xarch_ gets overwritten. if (Inputs.size() == 1 || Act->getType() == types::TY_Nothing) Actions.append(Inputs.begin(), Inputs.end()); else Actions.push_back(C.MakeAction(Inputs, Act->getType())); // Handle debug info queries. Arg *A = Args.getLastArg(options::OPT_g_Group); bool enablesDebugInfo = A && !A->getOption().matches(options::OPT_g0) && !A->getOption().matches(options::OPT_gstabs); if ((enablesDebugInfo || willEmitRemarks(Args)) && ContainsCompileOrAssembleAction(Actions.back())) { // Add a 'dsymutil' step if necessary, when debug info is enabled and we // have a compile input. We need to run 'dsymutil' ourselves in such cases // because the debug info will refer to a temporary object file which // will be removed at the end of the compilation process. if (Act->getType() == types::TY_Image) { ActionList Inputs; Inputs.push_back(Actions.back()); Actions.pop_back(); Actions.push_back( C.MakeAction(Inputs, types::TY_dSYM)); } // Verify the debug info output. if (Args.hasArg(options::OPT_verify_debug_info)) { Action* LastAction = Actions.back(); Actions.pop_back(); Actions.push_back(C.MakeAction( LastAction, types::TY_Nothing)); } } } } bool Driver::DiagnoseInputExistence(const DerivedArgList &Args, StringRef Value, types::ID Ty, bool TypoCorrect) const { if (!getCheckInputsExist()) return true; // stdin always exists. if (Value == "-") return true; // If it's a header to be found in the system or user search path, then defer // complaints about its absence until those searches can be done. When we // are definitely processing headers for C++20 header units, extend this to // allow the user to put "-fmodule-header -xc++-header vector" for example. if (Ty == types::TY_CXXSHeader || Ty == types::TY_CXXUHeader || (ModulesModeCXX20 && Ty == types::TY_CXXHeader)) return true; if (getVFS().exists(Value)) return true; if (TypoCorrect) { // Check if the filename is a typo for an option flag. OptTable thinks // that all args that are not known options and that start with / are // filenames, but e.g. `/diagnostic:caret` is more likely a typo for // the option `/diagnostics:caret` than a reference to a file in the root // directory. unsigned IncludedFlagsBitmask; unsigned ExcludedFlagsBitmask; std::tie(IncludedFlagsBitmask, ExcludedFlagsBitmask) = getIncludeExcludeOptionFlagMasks(IsCLMode()); std::string Nearest; if (getOpts().findNearest(Value, Nearest, IncludedFlagsBitmask, ExcludedFlagsBitmask) <= 1) { Diag(clang::diag::err_drv_no_such_file_with_suggestion) << Value << Nearest; return false; } } // In CL mode, don't error on apparently non-existent linker inputs, because // they can be influenced by linker flags the clang driver might not // understand. // Examples: // - `clang-cl main.cc ole32.lib` in a non-MSVC shell will make the driver // module look for an MSVC installation in the registry. 
  // (We could ask the MSVCToolChain object if it can find `ole32.lib`, but
  // the logic to look in the registry might move into lld-link in the future
  // so that lld-link invocations in non-MSVC shells just work too.)
  // - `clang-cl ... /link ...` can pass arbitrary flags to the linker,
  //   including /libpath:, which is used to find .lib and .obj files.
  // So do not diagnose this on the driver level. Rely on the linker diagnosing
  // it. (If we don't end up invoking the linker, this means we'll emit a
  // "'linker' input unused [-Wunused-command-line-argument]" warning instead
  // of an error.)
  //
  // Only do this skip after the typo correction step above. `/Brepo` is
  // treated as TY_Object, but it's clearly a typo for `/Brepro`. It seems fine
  // to emit an error if we have a flag that's within an edit distance of 1
  // from a flag. (Users can use `-Wl,` or `/linker` to launder the flag past
  // the driver in the unlikely case they run into this.)
  //
  // Don't do this for inputs that start with a '/', else we'd pass options
  // like /libpath: through to the linker silently.
  //
  // Emitting an error for linker inputs can also cause incorrect diagnostics
  // with the gcc driver. The command
  //   clang -fuse-ld=lld -Wl,--chroot,some/dir /file.o
  // will make lld look for some/dir/file.o, while we will diagnose here that
  // `/file.o` does not exist. However, configure scripts check if
  // `clang /GR-` compiles without error to see if the compiler is cl.exe,
  // so we can't downgrade diagnostics for `/GR-` from an error to a warning
  // in cc mode. (We can in cl mode because cl.exe itself only warns on
  // unknown flags.)
  if (IsCLMode() && Ty == types::TY_Object && !Value.startswith("/"))
    return true;

  Diag(clang::diag::err_drv_no_such_file) << Value;
  return false;
}

// Get the C++20 Header Unit type corresponding to the input type.
static types::ID CXXHeaderUnitType(ModuleHeaderMode HM) {
  switch (HM) {
  case HeaderMode_User:
    return types::TY_CXXUHeader;
  case HeaderMode_System:
    return types::TY_CXXSHeader;
  case HeaderMode_Default:
    break;
  case HeaderMode_None:
    llvm_unreachable("should not be called in this case");
  }
  return types::TY_CXXHUHeader;
}

// Construct the list of inputs and their types.
void Driver::BuildInputs(const ToolChain &TC, DerivedArgList &Args,
                         InputList &Inputs) const {
  const llvm::opt::OptTable &Opts = getOpts();
  // Track the current user specified (-x) input. We also explicitly track the
  // argument used to set the type; we only want to claim the type when we
  // actually use it, so we warn about unused -x arguments.
  types::ID InputType = types::TY_Nothing;
  Arg *InputTypeArg = nullptr;

  // The last /TC or /TP option sets the input type to C or C++ globally.
  if (Arg *TCTP = Args.getLastArgNoClaim(options::OPT__SLASH_TC,
                                         options::OPT__SLASH_TP)) {
    InputTypeArg = TCTP;
    InputType = TCTP->getOption().matches(options::OPT__SLASH_TC) ?
types::TY_C : types::TY_CXX; Arg *Previous = nullptr; bool ShowNote = false; for (Arg *A : Args.filtered(options::OPT__SLASH_TC, options::OPT__SLASH_TP)) { if (Previous) { Diag(clang::diag::warn_drv_overriding_flag_option) << Previous->getSpelling() << A->getSpelling(); ShowNote = true; } Previous = A; } if (ShowNote) Diag(clang::diag::note_drv_t_option_is_global); } // Warn -x after last input file has no effect if (!IsCLMode()) { Arg *LastXArg = Args.getLastArgNoClaim(options::OPT_x); Arg *LastInputArg = Args.getLastArgNoClaim(options::OPT_INPUT); if (LastXArg && LastInputArg && LastInputArg->getIndex() < LastXArg->getIndex()) Diag(clang::diag::warn_drv_unused_x) << LastXArg->getValue(); } else { // In CL mode suggest /TC or /TP since -x doesn't make sense if passed via // /clang:. if (auto *A = Args.getLastArg(options::OPT_x)) Diag(diag::err_drv_unsupported_opt_with_suggestion) << A->getAsString(Args) << "/TC' or '/TP"; } for (Arg *A : Args) { if (A->getOption().getKind() == Option::InputClass) { const char *Value = A->getValue(); types::ID Ty = types::TY_INVALID; // Infer the input type if necessary. if (InputType == types::TY_Nothing) { // If there was an explicit arg for this, claim it. if (InputTypeArg) InputTypeArg->claim(); // stdin must be handled specially. if (memcmp(Value, "-", 2) == 0) { if (IsFlangMode()) { Ty = types::TY_Fortran; } else { // If running with -E, treat as a C input (this changes the // builtin macros, for example). This may be overridden by -ObjC // below. // // Otherwise emit an error but still use a valid type to avoid // spurious errors (e.g., no inputs). assert(!CCGenDiagnostics && "stdin produces no crash reproducer"); if (!Args.hasArgNoClaim(options::OPT_E) && !CCCIsCPP()) Diag(IsCLMode() ? clang::diag::err_drv_unknown_stdin_type_clang_cl : clang::diag::err_drv_unknown_stdin_type); Ty = types::TY_C; } } else { // Otherwise lookup by extension. // Fallback is C if invoked as C preprocessor, C++ if invoked with // clang-cl /E, or Object otherwise. // We use a host hook here because Darwin at least has its own // idea of what .s is. if (const char *Ext = strrchr(Value, '.')) Ty = TC.LookupTypeForExtension(Ext + 1); if (Ty == types::TY_INVALID) { if (IsCLMode() && (Args.hasArgNoClaim(options::OPT_E) || CCGenDiagnostics)) Ty = types::TY_CXX; else if (CCCIsCPP() || CCGenDiagnostics) Ty = types::TY_C; else Ty = types::TY_Object; } // If the driver is invoked as C++ compiler (like clang++ or c++) it // should autodetect some input files as C++ for g++ compatibility. if (CCCIsCXX()) { types::ID OldTy = Ty; Ty = types::lookupCXXTypeForCType(Ty); // Do not complain about foo.h, when we are known to be processing // it as a C++20 header unit. if (Ty != OldTy && !(OldTy == types::TY_CHeader && hasHeaderMode())) Diag(clang::diag::warn_drv_treating_input_as_cxx) << getTypeName(OldTy) << getTypeName(Ty); } // If running with -fthinlto-index=, extensions that normally identify // native object files actually identify LLVM bitcode files. if (Args.hasArgNoClaim(options::OPT_fthinlto_index_EQ) && Ty == types::TY_Object) Ty = types::TY_LLVM_BC; } // -ObjC and -ObjC++ override the default language, but only for "source // files". We just treat everything that isn't a linker input as a // source file. // // FIXME: Clean this up if we move the phase sequence into the type. 
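        // For example, `clang -ObjC foo.c` treats foo.c as Objective-C,
        // while an object file passed on the same command line keeps its
        // TY_Object type and is handed straight to the linker.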
if (Ty != types::TY_Object) { if (Args.hasArg(options::OPT_ObjC)) Ty = types::TY_ObjC; else if (Args.hasArg(options::OPT_ObjCXX)) Ty = types::TY_ObjCXX; } // Disambiguate headers that are meant to be header units from those // intended to be PCH. Avoid missing '.h' cases that are counted as // C headers by default - we know we are in C++ mode and we do not // want to issue a complaint about compiling things in the wrong mode. if ((Ty == types::TY_CXXHeader || Ty == types::TY_CHeader) && hasHeaderMode()) Ty = CXXHeaderUnitType(CXX20HeaderType); } else { assert(InputTypeArg && "InputType set w/o InputTypeArg"); if (!InputTypeArg->getOption().matches(options::OPT_x)) { // If emulating cl.exe, make sure that /TC and /TP don't affect input // object files. const char *Ext = strrchr(Value, '.'); if (Ext && TC.LookupTypeForExtension(Ext + 1) == types::TY_Object) Ty = types::TY_Object; } if (Ty == types::TY_INVALID) { Ty = InputType; InputTypeArg->claim(); } } if (DiagnoseInputExistence(Args, Value, Ty, /*TypoCorrect=*/true)) Inputs.push_back(std::make_pair(Ty, A)); } else if (A->getOption().matches(options::OPT__SLASH_Tc)) { StringRef Value = A->getValue(); if (DiagnoseInputExistence(Args, Value, types::TY_C, /*TypoCorrect=*/false)) { Arg *InputArg = MakeInputArg(Args, Opts, A->getValue()); Inputs.push_back(std::make_pair(types::TY_C, InputArg)); } A->claim(); } else if (A->getOption().matches(options::OPT__SLASH_Tp)) { StringRef Value = A->getValue(); if (DiagnoseInputExistence(Args, Value, types::TY_CXX, /*TypoCorrect=*/false)) { Arg *InputArg = MakeInputArg(Args, Opts, A->getValue()); Inputs.push_back(std::make_pair(types::TY_CXX, InputArg)); } A->claim(); } else if (A->getOption().hasFlag(options::LinkerInput)) { // Just treat as object type, we could make a special type for this if // necessary. Inputs.push_back(std::make_pair(types::TY_Object, A)); } else if (A->getOption().matches(options::OPT_x)) { InputTypeArg = A; InputType = types::lookupTypeForTypeSpecifier(A->getValue()); A->claim(); // Follow gcc behavior and treat as linker input for invalid -x // options. Its not clear why we shouldn't just revert to unknown; but // this isn't very important, we might as well be bug compatible. if (!InputType) { Diag(clang::diag::err_drv_unknown_language) << A->getValue(); InputType = types::TY_Object; } // If the user has put -fmodule-header{,=} then we treat C++ headers as // header unit inputs. So we 'promote' -xc++-header appropriately. if (InputType == types::TY_CXXHeader && hasHeaderMode()) InputType = CXXHeaderUnitType(CXX20HeaderType); } else if (A->getOption().getID() == options::OPT_U) { assert(A->getNumValues() == 1 && "The /U option has one value."); StringRef Val = A->getValue(0); if (Val.find_first_of("/\\") != StringRef::npos) { // Warn about e.g. "/Users/me/myfile.c". Diag(diag::warn_slash_u_filename) << Val; Diag(diag::note_use_dashdash); } } } if (CCCIsCPP() && Inputs.empty()) { // If called as standalone preprocessor, stdin is processed // if no other input is present. Arg *A = MakeInputArg(Args, Opts, "-"); Inputs.push_back(std::make_pair(types::TY_C, A)); } } namespace { /// Provides a convenient interface for different programming models to generate /// the required device actions. class OffloadingActionBuilder final { /// Flag used to trace errors in the builder. bool IsValid = false; /// The compilation that is using this builder. Compilation &C; /// Map between an input argument and the offload kinds used to process it. 
std::map InputArgToOffloadKindMap; /// Map between a host action and its originating input argument. std::map HostActionToInputArgMap; /// Builder interface. It doesn't build anything or keep any state. class DeviceActionBuilder { public: typedef const llvm::SmallVectorImpl PhasesTy; enum ActionBuilderReturnCode { // The builder acted successfully on the current action. ABRT_Success, // The builder didn't have to act on the current action. ABRT_Inactive, // The builder was successful and requested the host action to not be // generated. ABRT_Ignore_Host, }; protected: /// Compilation associated with this builder. Compilation &C; /// Tool chains associated with this builder. The same programming /// model may have associated one or more tool chains. SmallVector ToolChains; /// The derived arguments associated with this builder. DerivedArgList &Args; /// The inputs associated with this builder. const Driver::InputList &Inputs; /// The associated offload kind. Action::OffloadKind AssociatedOffloadKind = Action::OFK_None; public: DeviceActionBuilder(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs, Action::OffloadKind AssociatedOffloadKind) : C(C), Args(Args), Inputs(Inputs), AssociatedOffloadKind(AssociatedOffloadKind) {} virtual ~DeviceActionBuilder() {} /// Fill up the array \a DA with all the device dependences that should be /// added to the provided host action \a HostAction. By default it is /// inactive. virtual ActionBuilderReturnCode getDeviceDependences(OffloadAction::DeviceDependences &DA, phases::ID CurPhase, phases::ID FinalPhase, PhasesTy &Phases) { return ABRT_Inactive; } /// Update the state to include the provided host action \a HostAction as a /// dependency of the current device action. By default it is inactive. virtual ActionBuilderReturnCode addDeviceDependences(Action *HostAction) { return ABRT_Inactive; } /// Append top level actions generated by the builder. virtual void appendTopLevelActions(ActionList &AL) {} /// Append linker device actions generated by the builder. virtual void appendLinkDeviceActions(ActionList &AL) {} /// Append linker host action generated by the builder. virtual Action* appendLinkHostActions(ActionList &AL) { return nullptr; } /// Append linker actions generated by the builder. virtual void appendLinkDependences(OffloadAction::DeviceDependences &DA) {} /// Initialize the builder. Return true if any initialization errors are /// found. virtual bool initialize() { return false; } /// Return true if the builder can use bundling/unbundling. virtual bool canUseBundlerUnbundler() const { return false; } /// Return true if this builder is valid. We have a valid builder if we have /// associated device tool chains. bool isValid() { return !ToolChains.empty(); } /// Return the associated offload kind. Action::OffloadKind getAssociatedOffloadKind() { return AssociatedOffloadKind; } }; /// Base class for CUDA/HIP action builder. It injects device code in /// the host backend action. class CudaActionBuilderBase : public DeviceActionBuilder { protected: /// Flags to signal if the user requested host-only or device-only /// compilation. bool CompileHostOnly = false; bool CompileDeviceOnly = false; bool EmitLLVM = false; bool EmitAsm = false; /// ID to identify each device compilation. For CUDA it is simply the /// GPU arch string. For HIP it is either the GPU arch string or GPU /// arch string plus feature strings delimited by a plus sign, e.g. /// gfx906+xnack. 
struct TargetID { /// Target ID string which is persistent throughout the compilation. const char *ID; TargetID(CudaArch Arch) { ID = CudaArchToString(Arch); } TargetID(const char *ID) : ID(ID) {} operator const char *() { return ID; } operator StringRef() { return StringRef(ID); } }; /// List of GPU architectures to use in this compilation. SmallVector GpuArchList; /// The CUDA actions for the current input. ActionList CudaDeviceActions; /// The CUDA fat binary if it was generated for the current input. Action *CudaFatBinary = nullptr; /// Flag that is set to true if this builder acted on the current input. bool IsActive = false; /// Flag for -fgpu-rdc. bool Relocatable = false; /// Default GPU architecture if there's no one specified. CudaArch DefaultCudaArch = CudaArch::UNKNOWN; /// Method to generate compilation unit ID specified by option /// '-fuse-cuid='. enum UseCUIDKind { CUID_Hash, CUID_Random, CUID_None, CUID_Invalid }; UseCUIDKind UseCUID = CUID_Hash; /// Compilation unit ID specified by option '-cuid='. StringRef FixedCUID; public: CudaActionBuilderBase(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs, Action::OffloadKind OFKind) : DeviceActionBuilder(C, Args, Inputs, OFKind) { CompileDeviceOnly = C.getDriver().offloadDeviceOnly(); Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, /*Default=*/false); } ActionBuilderReturnCode addDeviceDependences(Action *HostAction) override { // While generating code for CUDA, we only depend on the host input action // to trigger the creation of all the CUDA device actions. // If we are dealing with an input action, replicate it for each GPU // architecture. If we are in host-only mode we return 'success' so that // the host uses the CUDA offload kind. if (auto *IA = dyn_cast(HostAction)) { assert(!GpuArchList.empty() && "We should have at least one GPU architecture."); // If the host input is not CUDA or HIP, we don't need to bother about // this input. if (!(IA->getType() == types::TY_CUDA || IA->getType() == types::TY_HIP || IA->getType() == types::TY_PP_HIP)) { // The builder will ignore this input. IsActive = false; return ABRT_Inactive; } // Set the flag to true, so that the builder acts on the current input. IsActive = true; if (CompileHostOnly) return ABRT_Success; // Replicate inputs for each GPU architecture. auto Ty = IA->getType() == types::TY_HIP ? types::TY_HIP_DEVICE : types::TY_CUDA_DEVICE; std::string CUID = FixedCUID.str(); if (CUID.empty()) { if (UseCUID == CUID_Random) CUID = llvm::utohexstr(llvm::sys::Process::GetRandomNumber(), /*LowerCase=*/true); else if (UseCUID == CUID_Hash) { llvm::MD5 Hasher; llvm::MD5::MD5Result Hash; SmallString<256> RealPath; llvm::sys::fs::real_path(IA->getInputArg().getValue(), RealPath, /*expand_tilde=*/true); Hasher.update(RealPath); for (auto *A : Args) { if (A->getOption().matches(options::OPT_INPUT)) continue; Hasher.update(A->getAsString(Args)); } Hasher.final(Hash); CUID = llvm::utohexstr(Hash.low(), /*LowerCase=*/true); } } IA->setId(CUID); for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { CudaDeviceActions.push_back( C.MakeAction(IA->getInputArg(), Ty, IA->getId())); } return ABRT_Success; } // If this is an unbundling action use it as is for each CUDA toolchain. if (auto *UA = dyn_cast(HostAction)) { // If -fgpu-rdc is disabled, should not unbundle since there is no // device code to link. 
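        // (Without relocatable device code the device image was already
        // embedded into the host object at compile time, so there is nothing
        // separate left to unbundle and link.)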
        if (UA->getType() == types::TY_Object && !Relocatable)
          return ABRT_Inactive;

        CudaDeviceActions.clear();
        auto *IA = cast<InputAction>(UA->getInputs().back());
        std::string FileName = IA->getInputArg().getAsString(Args);
        // Check if the type of the file is the same as the action. Do not
        // unbundle it if it is not. Do not unbundle .so files, for example,
        // which are not object files. Files with extension ".lib" are
        // classified as TY_Object but they are actually archives, therefore
        // they should not be unbundled here as objects. They will be handled
        // at other places.
        const StringRef LibFileExt = ".lib";
        if (IA->getType() == types::TY_Object &&
            (!llvm::sys::path::has_extension(FileName) ||
             types::lookupTypeForExtension(
                 llvm::sys::path::extension(FileName).drop_front()) !=
                 types::TY_Object ||
             llvm::sys::path::extension(FileName) == LibFileExt))
          return ABRT_Inactive;

        for (auto Arch : GpuArchList) {
          CudaDeviceActions.push_back(UA);
          UA->registerDependentActionInfo(ToolChains[0], Arch,
                                          AssociatedOffloadKind);
        }
        IsActive = true;
        return ABRT_Success;
      }

      return IsActive ? ABRT_Success : ABRT_Inactive;
    }

    void appendTopLevelActions(ActionList &AL) override {
      // Utility to append actions to the top level list.
      auto AddTopLevel = [&](Action *A, TargetID TargetID) {
        OffloadAction::DeviceDependences Dep;
        Dep.add(*A, *ToolChains.front(), TargetID, AssociatedOffloadKind);
        AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
      };

      // If we have a fat binary, add it to the list.
      if (CudaFatBinary) {
        AddTopLevel(CudaFatBinary, CudaArch::UNUSED);
        CudaDeviceActions.clear();
        CudaFatBinary = nullptr;
        return;
      }

      if (CudaDeviceActions.empty())
        return;

      // If we have CUDA actions at this point, that's because we have a
      // partial compilation, so we should have an action for each GPU
      // architecture.
      assert(CudaDeviceActions.size() == GpuArchList.size() &&
             "Expecting one action per GPU architecture.");
      assert(ToolChains.size() == 1 &&
             "Expecting to have a single CUDA toolchain.");
      for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
        AddTopLevel(CudaDeviceActions[I], GpuArchList[I]);

      CudaDeviceActions.clear();
    }

    /// Get canonicalized offload arch option. \returns empty StringRef if the
    /// option is invalid.
    virtual StringRef getCanonicalOffloadArch(StringRef Arch) = 0;

    virtual std::optional<std::pair<llvm::StringRef, llvm::StringRef>>
    getConflictOffloadArchCombination(const std::set<StringRef> &GpuArchs) = 0;

    bool initialize() override {
      assert(AssociatedOffloadKind == Action::OFK_Cuda ||
             AssociatedOffloadKind == Action::OFK_HIP);

      // We don't need to support CUDA.
      if (AssociatedOffloadKind == Action::OFK_Cuda &&
          !C.hasOffloadToolChain<Action::OFK_Cuda>())
        return false;

      // We don't need to support HIP.
      if (AssociatedOffloadKind == Action::OFK_HIP &&
          !C.hasOffloadToolChain<Action::OFK_HIP>())
        return false;

      const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
      assert(HostTC && "No toolchain for host compilation.");
      if (HostTC->getTriple().isNVPTX() ||
          HostTC->getTriple().getArch() == llvm::Triple::amdgcn) {
        // We do not support targeting NVPTX/AMDGCN for host compilation. Throw
        // an error and abort pipeline construction early so we don't trip
        // asserts that assume device-side compilation.
        C.getDriver().Diag(diag::err_drv_cuda_host_arch)
            << HostTC->getTriple().getArchName();
        return true;
      }

      ToolChains.push_back(
          AssociatedOffloadKind == Action::OFK_Cuda ?
C.getSingleOffloadToolChain() : C.getSingleOffloadToolChain()); CompileHostOnly = C.getDriver().offloadHostOnly(); EmitLLVM = Args.getLastArg(options::OPT_emit_llvm); EmitAsm = Args.getLastArg(options::OPT_S); FixedCUID = Args.getLastArgValue(options::OPT_cuid_EQ); if (Arg *A = Args.getLastArg(options::OPT_fuse_cuid_EQ)) { StringRef UseCUIDStr = A->getValue(); UseCUID = llvm::StringSwitch(UseCUIDStr) .Case("hash", CUID_Hash) .Case("random", CUID_Random) .Case("none", CUID_None) .Default(CUID_Invalid); if (UseCUID == CUID_Invalid) { C.getDriver().Diag(diag::err_drv_invalid_value) << A->getAsString(Args) << UseCUIDStr; C.setContainsError(); return true; } } // --offload and --offload-arch options are mutually exclusive. if (Args.hasArgNoClaim(options::OPT_offload_EQ) && Args.hasArgNoClaim(options::OPT_offload_arch_EQ, options::OPT_no_offload_arch_EQ)) { C.getDriver().Diag(diag::err_opt_not_valid_with_opt) << "--offload-arch" << "--offload"; } // Collect all offload arch parameters, removing duplicates. std::set GpuArchs; bool Error = false; for (Arg *A : Args) { if (!(A->getOption().matches(options::OPT_offload_arch_EQ) || A->getOption().matches(options::OPT_no_offload_arch_EQ))) continue; A->claim(); for (StringRef ArchStr : llvm::split(A->getValue(), ",")) { if (A->getOption().matches(options::OPT_no_offload_arch_EQ) && ArchStr == "all") { GpuArchs.clear(); } else if (ArchStr == "native") { const ToolChain &TC = *ToolChains.front(); auto GPUsOrErr = ToolChains.front()->getSystemGPUArchs(Args); if (!GPUsOrErr) { TC.getDriver().Diag(diag::err_drv_undetermined_gpu_arch) << llvm::Triple::getArchTypeName(TC.getArch()) << llvm::toString(GPUsOrErr.takeError()) << "--offload-arch"; continue; } for (auto GPU : *GPUsOrErr) { GpuArchs.insert(Args.MakeArgString(GPU)); } } else { ArchStr = getCanonicalOffloadArch(ArchStr); if (ArchStr.empty()) { Error = true; } else if (A->getOption().matches(options::OPT_offload_arch_EQ)) GpuArchs.insert(ArchStr); else if (A->getOption().matches(options::OPT_no_offload_arch_EQ)) GpuArchs.erase(ArchStr); else llvm_unreachable("Unexpected option."); } } } auto &&ConflictingArchs = getConflictOffloadArchCombination(GpuArchs); if (ConflictingArchs) { C.getDriver().Diag(clang::diag::err_drv_bad_offload_arch_combo) << ConflictingArchs->first << ConflictingArchs->second; C.setContainsError(); return true; } // Collect list of GPUs remaining in the set. for (auto Arch : GpuArchs) GpuArchList.push_back(Arch.data()); // Default to sm_20 which is the lowest common denominator for // supported GPUs. sm_20 code should work correctly, if // suboptimally, on all newer GPUs. if (GpuArchList.empty()) { if (ToolChains.front()->getTriple().isSPIRV()) GpuArchList.push_back(CudaArch::Generic); else GpuArchList.push_back(DefaultCudaArch); } return Error; } }; /// \brief CUDA action builder. It injects device code in the host backend /// action. 
class CudaActionBuilder final : public CudaActionBuilderBase { public: CudaActionBuilder(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs) : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_Cuda) { DefaultCudaArch = CudaArch::SM_35; } StringRef getCanonicalOffloadArch(StringRef ArchStr) override { CudaArch Arch = StringToCudaArch(ArchStr); if (Arch == CudaArch::UNKNOWN || !IsNVIDIAGpuArch(Arch)) { C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr; return StringRef(); } return CudaArchToString(Arch); } std::optional> getConflictOffloadArchCombination( const std::set &GpuArchs) override { return std::nullopt; } ActionBuilderReturnCode getDeviceDependences(OffloadAction::DeviceDependences &DA, phases::ID CurPhase, phases::ID FinalPhase, PhasesTy &Phases) override { if (!IsActive) return ABRT_Inactive; // If we don't have more CUDA actions, we don't have any dependences to // create for the host. if (CudaDeviceActions.empty()) return ABRT_Success; assert(CudaDeviceActions.size() == GpuArchList.size() && "Expecting one action per GPU architecture."); assert(!CompileHostOnly && "Not expecting CUDA actions in host-only compilation."); // If we are generating code for the device or we are in a backend phase, // we attempt to generate the fat binary. We compile each arch to ptx and // assemble to cubin, then feed the cubin *and* the ptx into a device // "link" action, which uses fatbinary to combine these cubins into one // fatbin. The fatbin is then an input to the host action if not in // device-only mode. if (CompileDeviceOnly || CurPhase == phases::Backend) { ActionList DeviceActions; for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { // Produce the device action from the current phase up to the assemble // phase. for (auto Ph : Phases) { // Skip the phases that were already dealt with. if (Ph < CurPhase) continue; // We have to be consistent with the host final phase. if (Ph > FinalPhase) break; CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction( C, Args, Ph, CudaDeviceActions[I], Action::OFK_Cuda); if (Ph == phases::Assemble) break; } // If we didn't reach the assemble phase, we can't generate the fat // binary. We don't need to generate the fat binary if we are not in // device-only mode. if (!isa(CudaDeviceActions[I]) || CompileDeviceOnly) continue; Action *AssembleAction = CudaDeviceActions[I]; assert(AssembleAction->getType() == types::TY_Object); assert(AssembleAction->getInputs().size() == 1); Action *BackendAction = AssembleAction->getInputs()[0]; assert(BackendAction->getType() == types::TY_PP_Asm); for (auto &A : {AssembleAction, BackendAction}) { OffloadAction::DeviceDependences DDep; DDep.add(*A, *ToolChains.front(), GpuArchList[I], Action::OFK_Cuda); DeviceActions.push_back( C.MakeAction(DDep, A->getType())); } } // We generate the fat binary if we have device input actions. if (!DeviceActions.empty()) { CudaFatBinary = C.MakeAction(DeviceActions, types::TY_CUDA_FATBIN); if (!CompileDeviceOnly) { DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr, Action::OFK_Cuda); // Clear the fat binary, it is already a dependence to an host // action. CudaFatBinary = nullptr; } // Remove the CUDA actions as they are already connected to an host // action or fat binary. CudaDeviceActions.clear(); } // We avoid creating host action in device-only mode. return CompileDeviceOnly ? 
ABRT_Ignore_Host : ABRT_Success; } else if (CurPhase > phases::Backend) { // If we are past the backend phase and still have a device action, we // don't have to do anything as this action is already a device // top-level action. return ABRT_Success; } assert(CurPhase < phases::Backend && "Generating single CUDA " "instructions should only occur " "before the backend phase!"); // By default, we produce an action for each device arch. for (Action *&A : CudaDeviceActions) A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A); return ABRT_Success; } }; /// \brief HIP action builder. It injects device code in the host backend /// action. class HIPActionBuilder final : public CudaActionBuilderBase { /// The linker inputs obtained for each device arch. SmallVector DeviceLinkerInputs; // The default bundling behavior depends on the type of output, therefore // BundleOutput needs to be tri-value: None, true, or false. // Bundle code objects except --no-gpu-output is specified for device // only compilation. Bundle other type of output files only if // --gpu-bundle-output is specified for device only compilation. std::optional BundleOutput; std::optional EmitReloc; public: HIPActionBuilder(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs) : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) { DefaultCudaArch = CudaArch::GFX906; if (Args.hasArg(options::OPT_fhip_emit_relocatable, options::OPT_fno_hip_emit_relocatable)) { EmitReloc = Args.hasFlag(options::OPT_fhip_emit_relocatable, options::OPT_fno_hip_emit_relocatable, false); if (*EmitReloc) { if (Relocatable) { C.getDriver().Diag(diag::err_opt_not_valid_with_opt) << "-fhip-emit-relocatable" << "-fgpu-rdc"; } if (!CompileDeviceOnly) { C.getDriver().Diag(diag::err_opt_not_valid_without_opt) << "-fhip-emit-relocatable" << "--cuda-device-only"; } } } if (Args.hasArg(options::OPT_gpu_bundle_output, options::OPT_no_gpu_bundle_output)) BundleOutput = Args.hasFlag(options::OPT_gpu_bundle_output, options::OPT_no_gpu_bundle_output, true) && (!EmitReloc || !*EmitReloc); } bool canUseBundlerUnbundler() const override { return true; } StringRef getCanonicalOffloadArch(StringRef IdStr) override { llvm::StringMap Features; // getHIPOffloadTargetTriple() is known to return valid value as it has // been called successfully in the CreateOffloadingDeviceToolChains(). auto ArchStr = parseTargetID( *getHIPOffloadTargetTriple(C.getDriver(), C.getInputArgs()), IdStr, &Features); if (!ArchStr) { C.getDriver().Diag(clang::diag::err_drv_bad_target_id) << IdStr; C.setContainsError(); return StringRef(); } auto CanId = getCanonicalTargetID(*ArchStr, Features); return Args.MakeArgStringRef(CanId); }; std::optional> getConflictOffloadArchCombination( const std::set &GpuArchs) override { return getConflictTargetIDCombination(GpuArchs); } ActionBuilderReturnCode getDeviceDependences(OffloadAction::DeviceDependences &DA, phases::ID CurPhase, phases::ID FinalPhase, PhasesTy &Phases) override { if (!IsActive) return ABRT_Inactive; // amdgcn does not support linking of object files, therefore we skip // backend and assemble phases to output LLVM IR. Except for generating // non-relocatable device code, where we generate fat binary for device // code and pass to host in Backend phase. 
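      // If the earlier phases produced no device actions for this input,
      // there is nothing for the host action to depend on.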
if (CudaDeviceActions.empty()) return ABRT_Success; assert(((CurPhase == phases::Link && Relocatable) || CudaDeviceActions.size() == GpuArchList.size()) && "Expecting one action per GPU architecture."); assert(!CompileHostOnly && "Not expecting HIP actions in host-only compilation."); bool ShouldLink = !EmitReloc || !*EmitReloc; if (!Relocatable && CurPhase == phases::Backend && !EmitLLVM && !EmitAsm && ShouldLink) { // If we are in backend phase, we attempt to generate the fat binary. // We compile each arch to IR and use a link action to generate code // object containing ISA. Then we use a special "link" action to create // a fat binary containing all the code objects for different GPU's. // The fat binary is then an input to the host action. for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { if (C.getDriver().isUsingLTO(/*IsOffload=*/true)) { // When LTO is enabled, skip the backend and assemble phases and // use lld to link the bitcode. ActionList AL; AL.push_back(CudaDeviceActions[I]); // Create a link action to link device IR with device library // and generate ISA. CudaDeviceActions[I] = C.MakeAction(AL, types::TY_Image); } else { // When LTO is not enabled, we follow the conventional // compiler phases, including backend and assemble phases. ActionList AL; Action *BackendAction = nullptr; if (ToolChains.front()->getTriple().isSPIRV()) { // Emit LLVM bitcode for SPIR-V targets. SPIR-V device tool chain // (HIPSPVToolChain) runs post-link LLVM IR passes. types::ID Output = Args.hasArg(options::OPT_S) ? types::TY_LLVM_IR : types::TY_LLVM_BC; BackendAction = C.MakeAction(CudaDeviceActions[I], Output); } else BackendAction = C.getDriver().ConstructPhaseAction( C, Args, phases::Backend, CudaDeviceActions[I], AssociatedOffloadKind); auto AssembleAction = C.getDriver().ConstructPhaseAction( C, Args, phases::Assemble, BackendAction, AssociatedOffloadKind); AL.push_back(AssembleAction); // Create a link action to link device IR with device library // and generate ISA. CudaDeviceActions[I] = C.MakeAction(AL, types::TY_Image); } // OffloadingActionBuilder propagates device arch until an offload // action. Since the next action for creating fatbin does // not have device arch, whereas the above link action and its input // have device arch, an offload action is needed to stop the null // device arch of the next action being propagated to the above link // action. OffloadAction::DeviceDependences DDep; DDep.add(*CudaDeviceActions[I], *ToolChains.front(), GpuArchList[I], AssociatedOffloadKind); CudaDeviceActions[I] = C.MakeAction( DDep, CudaDeviceActions[I]->getType()); } if (!CompileDeviceOnly || !BundleOutput || *BundleOutput) { // Create HIP fat binary with a special "link" action. CudaFatBinary = C.MakeAction(CudaDeviceActions, types::TY_HIP_FATBIN); if (!CompileDeviceOnly) { DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr, AssociatedOffloadKind); // Clear the fat binary, it is already a dependence to an host // action. CudaFatBinary = nullptr; } // Remove the CUDA actions as they are already connected to an host // action or fat binary. CudaDeviceActions.clear(); } return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success; } else if (CurPhase == phases::Link) { if (!ShouldLink) return ABRT_Success; // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch. // This happens to each device action originated from each input file. 
// Later on, device actions in DeviceLinkerInputs are used to create // device link actions in appendLinkDependences and the created device // link actions are passed to the offload action as device dependence. DeviceLinkerInputs.resize(CudaDeviceActions.size()); auto LI = DeviceLinkerInputs.begin(); for (auto *A : CudaDeviceActions) { LI->push_back(A); ++LI; } // We will pass the device action as a host dependence, so we don't // need to do anything else with them. CudaDeviceActions.clear(); return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success; } // By default, we produce an action for each device arch. for (Action *&A : CudaDeviceActions) A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A, AssociatedOffloadKind); if (CompileDeviceOnly && CurPhase == FinalPhase && BundleOutput && *BundleOutput) { for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { OffloadAction::DeviceDependences DDep; DDep.add(*CudaDeviceActions[I], *ToolChains.front(), GpuArchList[I], AssociatedOffloadKind); CudaDeviceActions[I] = C.MakeAction( DDep, CudaDeviceActions[I]->getType()); } CudaFatBinary = C.MakeAction(CudaDeviceActions); CudaDeviceActions.clear(); } return (CompileDeviceOnly && (CurPhase == FinalPhase || (!ShouldLink && CurPhase == phases::Assemble))) ? ABRT_Ignore_Host : ABRT_Success; } void appendLinkDeviceActions(ActionList &AL) override { if (DeviceLinkerInputs.size() == 0) return; assert(DeviceLinkerInputs.size() == GpuArchList.size() && "Linker inputs and GPU arch list sizes do not match."); ActionList Actions; unsigned I = 0; // Append a new link action for each device. // Each entry in DeviceLinkerInputs corresponds to a GPU arch. for (auto &LI : DeviceLinkerInputs) { types::ID Output = Args.hasArg(options::OPT_emit_llvm) ? types::TY_LLVM_BC : types::TY_Image; auto *DeviceLinkAction = C.MakeAction(LI, Output); // Linking all inputs for the current GPU arch. // LI contains all the inputs for the linker. OffloadAction::DeviceDependences DeviceLinkDeps; DeviceLinkDeps.add(*DeviceLinkAction, *ToolChains[0], GpuArchList[I], AssociatedOffloadKind); Actions.push_back(C.MakeAction( DeviceLinkDeps, DeviceLinkAction->getType())); ++I; } DeviceLinkerInputs.clear(); // If emitting LLVM, do not generate final host/device compilation action if (Args.hasArg(options::OPT_emit_llvm)) { AL.append(Actions); return; } // Create a host object from all the device images by embedding them // in a fat binary for mixed host-device compilation. For device-only // compilation, creates a fat binary. OffloadAction::DeviceDependences DDeps; if (!CompileDeviceOnly || !BundleOutput || *BundleOutput) { auto *TopDeviceLinkAction = C.MakeAction( Actions, CompileDeviceOnly ? types::TY_HIP_FATBIN : types::TY_Object); DDeps.add(*TopDeviceLinkAction, *ToolChains[0], nullptr, AssociatedOffloadKind); // Offload the host object to the host linker. AL.push_back( C.MakeAction(DDeps, TopDeviceLinkAction->getType())); } else { AL.append(Actions); } } Action* appendLinkHostActions(ActionList &AL) override { return AL.back(); } void appendLinkDependences(OffloadAction::DeviceDependences &DA) override {} }; /// /// TODO: Add the implementation for other specialized builders here. /// /// Specialized builders being used by this offloading action builder. SmallVector SpecializedBuilders; /// Flag set to true if all valid builders allow file bundling/unbundling. 
bool CanUseBundler; public: OffloadingActionBuilder(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs) : C(C) { // Create a specialized builder for each device toolchain. IsValid = true; // Create a specialized builder for CUDA. SpecializedBuilders.push_back(new CudaActionBuilder(C, Args, Inputs)); // Create a specialized builder for HIP. SpecializedBuilders.push_back(new HIPActionBuilder(C, Args, Inputs)); // // TODO: Build other specialized builders here. // // Initialize all the builders, keeping track of errors. If all valid // builders agree that we can use bundling, set the flag to true. unsigned ValidBuilders = 0u; unsigned ValidBuildersSupportingBundling = 0u; for (auto *SB : SpecializedBuilders) { IsValid = IsValid && !SB->initialize(); // Update the counters if the builder is valid. if (SB->isValid()) { ++ValidBuilders; if (SB->canUseBundlerUnbundler()) ++ValidBuildersSupportingBundling; } } CanUseBundler = ValidBuilders && ValidBuilders == ValidBuildersSupportingBundling; } ~OffloadingActionBuilder() { for (auto *SB : SpecializedBuilders) delete SB; } /// Record a host action and its originating input argument. void recordHostAction(Action *HostAction, const Arg *InputArg) { assert(HostAction && "Invalid host action"); assert(InputArg && "Invalid input argument"); auto Loc = HostActionToInputArgMap.find(HostAction); if (Loc == HostActionToInputArgMap.end()) HostActionToInputArgMap[HostAction] = InputArg; assert(HostActionToInputArgMap[HostAction] == InputArg && "host action mapped to multiple input arguments"); } /// Generate an action that adds device dependences (if any) to a host action. /// If no device dependence actions exist, just return the host action \a /// HostAction. If an error is found or if no builder requires the host action /// to be generated, return nullptr. Action * addDeviceDependencesToHostAction(Action *HostAction, const Arg *InputArg, phases::ID CurPhase, phases::ID FinalPhase, DeviceActionBuilder::PhasesTy &Phases) { if (!IsValid) return nullptr; if (SpecializedBuilders.empty()) return HostAction; assert(HostAction && "Invalid host action!"); recordHostAction(HostAction, InputArg); OffloadAction::DeviceDependences DDeps; // Check if all the programming models agree we should not emit the host // action. Also, keep track of the offloading kinds employed. auto &OffloadKind = InputArgToOffloadKindMap[InputArg]; unsigned InactiveBuilders = 0u; unsigned IgnoringBuilders = 0u; for (auto *SB : SpecializedBuilders) { if (!SB->isValid()) { ++InactiveBuilders; continue; } auto RetCode = SB->getDeviceDependences(DDeps, CurPhase, FinalPhase, Phases); // If the builder explicitly says the host action should be ignored, // we need to increment the variable that tracks the builders that request // the host object to be ignored. if (RetCode == DeviceActionBuilder::ABRT_Ignore_Host) ++IgnoringBuilders; // Unless the builder was inactive for this action, we have to record the // offload kind because the host will have to use it. if (RetCode != DeviceActionBuilder::ABRT_Inactive) OffloadKind |= SB->getAssociatedOffloadKind(); } // If all builders agree that the host object should be ignored, just return // nullptr. if (IgnoringBuilders && SpecializedBuilders.size() == (InactiveBuilders + IgnoringBuilders)) return nullptr; if (DDeps.getActions().empty()) return HostAction; // We have dependences we need to bundle together. We use an offload action // for that. 
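    // The resulting "offload" node wraps the host action together with all of
    // its collected device dependences in the action graph.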
OffloadAction::HostDependence HDep( *HostAction, *C.getSingleOffloadToolChain(), /*BoundArch=*/nullptr, DDeps); return C.MakeAction(HDep, DDeps); } /// Generate an action that adds a host dependence to a device action. The /// results will be kept in this action builder. Return true if an error was /// found. bool addHostDependenceToDeviceActions(Action *&HostAction, const Arg *InputArg) { if (!IsValid) return true; recordHostAction(HostAction, InputArg); // If we are supporting bundling/unbundling and the current action is an // input action of non-source file, we replace the host action by the // unbundling action. The bundler tool has the logic to detect if an input // is a bundle or not and if the input is not a bundle it assumes it is a // host file. Therefore it is safe to create an unbundling action even if // the input is not a bundle. if (CanUseBundler && isa(HostAction) && InputArg->getOption().getKind() == llvm::opt::Option::InputClass && (!types::isSrcFile(HostAction->getType()) || HostAction->getType() == types::TY_PP_HIP)) { auto UnbundlingHostAction = C.MakeAction(HostAction); UnbundlingHostAction->registerDependentActionInfo( C.getSingleOffloadToolChain(), /*BoundArch=*/StringRef(), Action::OFK_Host); HostAction = UnbundlingHostAction; recordHostAction(HostAction, InputArg); } assert(HostAction && "Invalid host action!"); // Register the offload kinds that are used. auto &OffloadKind = InputArgToOffloadKindMap[InputArg]; for (auto *SB : SpecializedBuilders) { if (!SB->isValid()) continue; auto RetCode = SB->addDeviceDependences(HostAction); // Host dependences for device actions are not compatible with that same // action being ignored. assert(RetCode != DeviceActionBuilder::ABRT_Ignore_Host && "Host dependence not expected to be ignored.!"); // Unless the builder was inactive for this action, we have to record the // offload kind because the host will have to use it. if (RetCode != DeviceActionBuilder::ABRT_Inactive) OffloadKind |= SB->getAssociatedOffloadKind(); } // Do not use unbundler if the Host does not depend on device action. if (OffloadKind == Action::OFK_None && CanUseBundler) if (auto *UA = dyn_cast(HostAction)) HostAction = UA->getInputs().back(); return false; } /// Add the offloading top level actions to the provided action list. This /// function can replace the host action by a bundling action if the /// programming models allow it. bool appendTopLevelActions(ActionList &AL, Action *HostAction, const Arg *InputArg) { if (HostAction) recordHostAction(HostAction, InputArg); // Get the device actions to be appended. ActionList OffloadAL; for (auto *SB : SpecializedBuilders) { if (!SB->isValid()) continue; SB->appendTopLevelActions(OffloadAL); } // If we can use the bundler, replace the host action by the bundling one in // the resulting list. Otherwise, just append the device actions. For // device only compilation, HostAction is a null pointer, therefore only do // this when HostAction is not a null pointer. if (CanUseBundler && HostAction && HostAction->getType() != types::TY_Nothing && !OffloadAL.empty()) { // Add the host action to the list in order to create the bundling action. OffloadAL.push_back(HostAction); // We expect that the host action was just appended to the action list // before this method was called. 
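      // (BuildActions pushes the current host action into the action list
      // right before calling appendTopLevelActions, so it must still be the
      // last element.)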
assert(HostAction == AL.back() && "Host action not in the list??"); HostAction = C.MakeAction(OffloadAL); recordHostAction(HostAction, InputArg); AL.back() = HostAction; } else AL.append(OffloadAL.begin(), OffloadAL.end()); // Propagate to the current host action (if any) the offload information // associated with the current input. if (HostAction) HostAction->propagateHostOffloadInfo(InputArgToOffloadKindMap[InputArg], /*BoundArch=*/nullptr); return false; } void appendDeviceLinkActions(ActionList &AL) { for (DeviceActionBuilder *SB : SpecializedBuilders) { if (!SB->isValid()) continue; SB->appendLinkDeviceActions(AL); } } Action *makeHostLinkAction() { // Build a list of device linking actions. ActionList DeviceAL; appendDeviceLinkActions(DeviceAL); if (DeviceAL.empty()) return nullptr; // Let builders add host linking actions. Action* HA = nullptr; for (DeviceActionBuilder *SB : SpecializedBuilders) { if (!SB->isValid()) continue; HA = SB->appendLinkHostActions(DeviceAL); // This created host action has no originating input argument, therefore // needs to set its offloading kind directly. if (HA) HA->propagateHostOffloadInfo(SB->getAssociatedOffloadKind(), /*BoundArch=*/nullptr); } return HA; } /// Processes the host linker action. This currently consists of replacing it /// with an offload action if there are device link objects and propagate to /// the host action all the offload kinds used in the current compilation. The /// resulting action is returned. Action *processHostLinkAction(Action *HostAction) { // Add all the dependences from the device linking actions. OffloadAction::DeviceDependences DDeps; for (auto *SB : SpecializedBuilders) { if (!SB->isValid()) continue; SB->appendLinkDependences(DDeps); } // Calculate all the offload kinds used in the current compilation. unsigned ActiveOffloadKinds = 0u; for (auto &I : InputArgToOffloadKindMap) ActiveOffloadKinds |= I.second; // If we don't have device dependencies, we don't have to create an offload // action. if (DDeps.getActions().empty()) { // Set all the active offloading kinds to the link action. Given that it // is a link action it is assumed to depend on all actions generated so // far. HostAction->setHostOffloadInfo(ActiveOffloadKinds, /*BoundArch=*/nullptr); // Propagate active offloading kinds for each input to the link action. // Each input may have different active offloading kind. for (auto *A : HostAction->inputs()) { auto ArgLoc = HostActionToInputArgMap.find(A); if (ArgLoc == HostActionToInputArgMap.end()) continue; auto OFKLoc = InputArgToOffloadKindMap.find(ArgLoc->second); if (OFKLoc == InputArgToOffloadKindMap.end()) continue; A->propagateHostOffloadInfo(OFKLoc->second, /*BoundArch=*/nullptr); } return HostAction; } // Create the offload action with all dependences. When an offload action // is created the kinds are propagated to the host action, so we don't have // to do that explicitly here. OffloadAction::HostDependence HDep( *HostAction, *C.getSingleOffloadToolChain(), /*BoundArch*/ nullptr, ActiveOffloadKinds); return C.MakeAction(HDep, DDeps); } }; } // anonymous namespace. void Driver::handleArguments(Compilation &C, DerivedArgList &Args, const InputList &Inputs, ActionList &Actions) const { // Ignore /Yc/Yu if both /Yc and /Yu passed but with different filenames. 
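  // (/Yc creates a precompiled header and /Yu consumes one; they only make
  // sense together when they name the same header.)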
Arg *YcArg = Args.getLastArg(options::OPT__SLASH_Yc); Arg *YuArg = Args.getLastArg(options::OPT__SLASH_Yu); if (YcArg && YuArg && strcmp(YcArg->getValue(), YuArg->getValue()) != 0) { Diag(clang::diag::warn_drv_ycyu_different_arg_clang_cl); Args.eraseArg(options::OPT__SLASH_Yc); Args.eraseArg(options::OPT__SLASH_Yu); YcArg = YuArg = nullptr; } if (YcArg && Inputs.size() > 1) { Diag(clang::diag::warn_drv_yc_multiple_inputs_clang_cl); Args.eraseArg(options::OPT__SLASH_Yc); YcArg = nullptr; } Arg *FinalPhaseArg; phases::ID FinalPhase = getFinalPhase(Args, &FinalPhaseArg); if (FinalPhase == phases::Link) { // Emitting LLVM while linking disabled except in HIPAMD Toolchain if (Args.hasArg(options::OPT_emit_llvm) && !Args.hasArg(options::OPT_hip_link)) Diag(clang::diag::err_drv_emit_llvm_link); if (IsCLMode() && LTOMode != LTOK_None && !Args.getLastArgValue(options::OPT_fuse_ld_EQ) .equals_insensitive("lld")) Diag(clang::diag::err_drv_lto_without_lld); // If -dumpdir is not specified, give a default prefix derived from the link // output filename. For example, `clang -g -gsplit-dwarf a.c -o x` passes // `-dumpdir x-` to cc1. If -o is unspecified, use // stem(getDefaultImageName()) (usually stem("a.out") = "a"). if (!Args.hasArg(options::OPT_dumpdir)) { Arg *FinalOutput = Args.getLastArg(options::OPT_o, options::OPT__SLASH_o); Arg *Arg = Args.MakeSeparateArg( nullptr, getOpts().getOption(options::OPT_dumpdir), Args.MakeArgString( (FinalOutput ? FinalOutput->getValue() : llvm::sys::path::stem(getDefaultImageName())) + "-")); Arg->claim(); Args.append(Arg); } } if (FinalPhase == phases::Preprocess || Args.hasArg(options::OPT__SLASH_Y_)) { // If only preprocessing or /Y- is used, all pch handling is disabled. // Rather than check for it everywhere, just remove clang-cl pch-related // flags here. Args.eraseArg(options::OPT__SLASH_Fp); Args.eraseArg(options::OPT__SLASH_Yc); Args.eraseArg(options::OPT__SLASH_Yu); YcArg = YuArg = nullptr; } unsigned LastPLSize = 0; for (auto &I : Inputs) { types::ID InputType = I.first; const Arg *InputArg = I.second; auto PL = types::getCompilationPhases(InputType); LastPLSize = PL.size(); // If the first step comes after the final phase we are doing as part of // this compilation, warn the user about it. phases::ID InitialPhase = PL[0]; if (InitialPhase > FinalPhase) { if (InputArg->isClaimed()) continue; // Claim here to avoid the more general unused warning. InputArg->claim(); // Suppress all unused style warnings with -Qunused-arguments if (Args.hasArg(options::OPT_Qunused_arguments)) continue; // Special case when final phase determined by binary name, rather than // by a command-line argument with a corresponding Arg. if (CCCIsCPP()) Diag(clang::diag::warn_drv_input_file_unused_by_cpp) << InputArg->getAsString(Args) << getPhaseName(InitialPhase); // Special case '-E' warning on a previously preprocessed file to make // more sense. else if (InitialPhase == phases::Compile && (Args.getLastArg(options::OPT__SLASH_EP, options::OPT__SLASH_P) || Args.getLastArg(options::OPT_E) || Args.getLastArg(options::OPT_M, options::OPT_MM)) && getPreprocessedType(InputType) == types::TY_INVALID) Diag(clang::diag::warn_drv_preprocessed_input_file_unused) << InputArg->getAsString(Args) << !!FinalPhaseArg << (FinalPhaseArg ? FinalPhaseArg->getOption().getName() : ""); else Diag(clang::diag::warn_drv_input_file_unused) << InputArg->getAsString(Args) << getPhaseName(InitialPhase) << !!FinalPhaseArg << (FinalPhaseArg ? 
FinalPhaseArg->getOption().getName() : ""); continue; } if (YcArg) { // Add a separate precompile phase for the compile phase. if (FinalPhase >= phases::Compile) { const types::ID HeaderType = lookupHeaderTypeForSourceType(InputType); // Build the pipeline for the pch file. Action *ClangClPch = C.MakeAction(*InputArg, HeaderType); for (phases::ID Phase : types::getCompilationPhases(HeaderType)) ClangClPch = ConstructPhaseAction(C, Args, Phase, ClangClPch); assert(ClangClPch); Actions.push_back(ClangClPch); // The driver currently exits after the first failed command. This // relies on that behavior, to make sure if the pch generation fails, // the main compilation won't run. // FIXME: If the main compilation fails, the PCH generation should // probably not be considered successful either. } } } // If we are linking, claim any options which are obviously only used for // compilation. // FIXME: Understand why the last Phase List length is used here. if (FinalPhase == phases::Link && LastPLSize == 1) { Args.ClaimAllArgs(options::OPT_CompileOnly_Group); Args.ClaimAllArgs(options::OPT_cl_compile_Group); } } void Driver::BuildActions(Compilation &C, DerivedArgList &Args, const InputList &Inputs, ActionList &Actions) const { llvm::PrettyStackTraceString CrashInfo("Building compilation actions"); if (!SuppressMissingInputWarning && Inputs.empty()) { Diag(clang::diag::err_drv_no_input_files); return; } // Diagnose misuse of /Fo. if (Arg *A = Args.getLastArg(options::OPT__SLASH_Fo)) { StringRef V = A->getValue(); if (Inputs.size() > 1 && !V.empty() && !llvm::sys::path::is_separator(V.back())) { // Check whether /Fo tries to name an output file for multiple inputs. Diag(clang::diag::err_drv_out_file_argument_with_multiple_sources) << A->getSpelling() << V; Args.eraseArg(options::OPT__SLASH_Fo); } } // Diagnose misuse of /Fa. if (Arg *A = Args.getLastArg(options::OPT__SLASH_Fa)) { StringRef V = A->getValue(); if (Inputs.size() > 1 && !V.empty() && !llvm::sys::path::is_separator(V.back())) { // Check whether /Fa tries to name an asm file for multiple inputs. Diag(clang::diag::err_drv_out_file_argument_with_multiple_sources) << A->getSpelling() << V; Args.eraseArg(options::OPT__SLASH_Fa); } } // Diagnose misuse of /o. if (Arg *A = Args.getLastArg(options::OPT__SLASH_o)) { if (A->getValue()[0] == '\0') { // It has to have a value. Diag(clang::diag::err_drv_missing_argument) << A->getSpelling() << 1; Args.eraseArg(options::OPT__SLASH_o); } } handleArguments(C, Args, Inputs, Actions); bool UseNewOffloadingDriver = C.isOffloadingHostKind(Action::OFK_OpenMP) || Args.hasFlag(options::OPT_offload_new_driver, options::OPT_no_offload_new_driver, false); // Builder to be used to build offloading actions. std::unique_ptr OffloadBuilder = !UseNewOffloadingDriver ? std::make_unique(C, Args, Inputs) : nullptr; // Construct the actions to perform. ExtractAPIJobAction *ExtractAPIAction = nullptr; ActionList LinkerInputs; ActionList MergerInputs; for (auto &I : Inputs) { types::ID InputType = I.first; const Arg *InputArg = I.second; auto PL = types::getCompilationPhases(*this, Args, InputType); if (PL.empty()) continue; auto FullPL = types::getCompilationPhases(InputType); // Build the pipeline for this file. Action *Current = C.MakeAction(*InputArg, InputType); // Use the current host action in any of the offloading actions, if // required. 
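    // (Only the legacy OffloadingActionBuilder path needs this; the new
    // offloading driver attaches device actions later via
    // BuildOffloadingActions.)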
if (!UseNewOffloadingDriver) if (OffloadBuilder->addHostDependenceToDeviceActions(Current, InputArg)) break; for (phases::ID Phase : PL) { // Add any offload action the host action depends on. if (!UseNewOffloadingDriver) Current = OffloadBuilder->addDeviceDependencesToHostAction( Current, InputArg, Phase, PL.back(), FullPL); if (!Current) break; // Queue linker inputs. if (Phase == phases::Link) { assert(Phase == PL.back() && "linking must be final compilation step."); // We don't need to generate additional link commands if emitting AMD // bitcode or compiling only for the offload device if (!(C.getInputArgs().hasArg(options::OPT_hip_link) && (C.getInputArgs().hasArg(options::OPT_emit_llvm))) && !offloadDeviceOnly()) LinkerInputs.push_back(Current); Current = nullptr; break; } // TODO: Consider removing this because the merged may not end up being // the final Phase in the pipeline. Perhaps the merged could just merge // and then pass an artifact of some sort to the Link Phase. // Queue merger inputs. if (Phase == phases::IfsMerge) { assert(Phase == PL.back() && "merging must be final compilation step."); MergerInputs.push_back(Current); Current = nullptr; break; } if (Phase == phases::Precompile && ExtractAPIAction) { ExtractAPIAction->addHeaderInput(Current); Current = nullptr; break; } // FIXME: Should we include any prior module file outputs as inputs of // later actions in the same command line? // Otherwise construct the appropriate action. Action *NewCurrent = ConstructPhaseAction(C, Args, Phase, Current); // We didn't create a new action, so we will just move to the next phase. if (NewCurrent == Current) continue; if (auto *EAA = dyn_cast(NewCurrent)) ExtractAPIAction = EAA; Current = NewCurrent; // Try to build the offloading actions and add the result as a dependency // to the host. if (UseNewOffloadingDriver) Current = BuildOffloadingActions(C, Args, I, Current); // Use the current host action in any of the offloading actions, if // required. else if (OffloadBuilder->addHostDependenceToDeviceActions(Current, InputArg)) break; if (Current->getType() == types::TY_Nothing) break; } // If we ended with something, add to the output list. if (Current) Actions.push_back(Current); // Add any top level actions generated for offloading. if (!UseNewOffloadingDriver) OffloadBuilder->appendTopLevelActions(Actions, Current, InputArg); else if (Current) Current->propagateHostOffloadInfo(C.getActiveOffloadKinds(), /*BoundArch=*/nullptr); } // Add a link action if necessary. if (LinkerInputs.empty()) { Arg *FinalPhaseArg; if (getFinalPhase(Args, &FinalPhaseArg) == phases::Link) if (!UseNewOffloadingDriver) OffloadBuilder->appendDeviceLinkActions(Actions); } if (!LinkerInputs.empty()) { if (!UseNewOffloadingDriver) if (Action *Wrapper = OffloadBuilder->makeHostLinkAction()) LinkerInputs.push_back(Wrapper); Action *LA; // Check if this Linker Job should emit a static library. if (ShouldEmitStaticLibrary(Args)) { LA = C.MakeAction(LinkerInputs, types::TY_Image); } else if (UseNewOffloadingDriver || Args.hasArg(options::OPT_offload_link)) { LA = C.MakeAction(LinkerInputs, types::TY_Image); LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(), /*BoundArch=*/nullptr); } else { LA = C.MakeAction(LinkerInputs, types::TY_Image); } if (!UseNewOffloadingDriver) LA = OffloadBuilder->processHostLinkAction(LA); Actions.push_back(LA); } // Add an interface stubs merge action if necessary. 
if (!MergerInputs.empty()) Actions.push_back( C.MakeAction(MergerInputs, types::TY_Image)); if (Args.hasArg(options::OPT_emit_interface_stubs)) { auto PhaseList = types::getCompilationPhases( types::TY_IFS_CPP, Args.hasArg(options::OPT_c) ? phases::Compile : phases::IfsMerge); ActionList MergerInputs; for (auto &I : Inputs) { types::ID InputType = I.first; const Arg *InputArg = I.second; // Currently clang and the llvm assembler do not support generating symbol // stubs from assembly, so we skip the input on asm files. For ifs files // we rely on the normal pipeline setup in the pipeline setup code above. if (InputType == types::TY_IFS || InputType == types::TY_PP_Asm || InputType == types::TY_Asm) continue; Action *Current = C.MakeAction(*InputArg, InputType); for (auto Phase : PhaseList) { switch (Phase) { default: llvm_unreachable( "IFS Pipeline can only consist of Compile followed by IfsMerge."); case phases::Compile: { // Only IfsMerge (llvm-ifs) can handle .o files by looking for ifs // files where the .o file is located. The compile action can not // handle this. if (InputType == types::TY_Object) break; Current = C.MakeAction(Current, types::TY_IFS_CPP); break; } case phases::IfsMerge: { assert(Phase == PhaseList.back() && "merging must be final compilation step."); MergerInputs.push_back(Current); Current = nullptr; break; } } } // If we ended with something, add to the output list. if (Current) Actions.push_back(Current); } // Add an interface stubs merge action if necessary. if (!MergerInputs.empty()) Actions.push_back( C.MakeAction(MergerInputs, types::TY_Image)); } // If --print-supported-cpus, -mcpu=? or -mtune=? is specified, build a custom // Compile phase that prints out supported cpu models and quits. if (Arg *A = Args.getLastArg(options::OPT_print_supported_cpus)) { // Use the -mcpu=? flag as the dummy input to cc1. Actions.clear(); Action *InputAc = C.MakeAction(*A, types::TY_C); Actions.push_back( C.MakeAction(InputAc, types::TY_Nothing)); for (auto &I : Inputs) I.second->claim(); } // Call validator for dxil when -Vd not in Args. if (C.getDefaultToolChain().getTriple().isDXIL()) { // Only add action when needValidation. const auto &TC = static_cast(C.getDefaultToolChain()); if (TC.requiresValidation(Args)) { Action *LastAction = Actions.back(); Actions.push_back(C.MakeAction( LastAction, types::TY_DX_CONTAINER)); } } // Claim ignored clang-cl options. Args.ClaimAllArgs(options::OPT_cl_ignored_Group); } /// Returns the canonical name for the offloading architecture when using a HIP /// or CUDA architecture. static StringRef getCanonicalArchString(Compilation &C, const llvm::opt::DerivedArgList &Args, StringRef ArchStr, const llvm::Triple &Triple, bool SuppressError = false) { // Lookup the CUDA / HIP architecture string. Only report an error if we were // expecting the triple to be only NVPTX / AMDGPU. 
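  // For example, "sm_70" maps directly to a known CudaArch for NVPTX, while
  // an AMD target ID such as "gfx90a:xnack+" is reduced to its processor name
  // ("gfx90a") before the lookup.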
  CudaArch Arch = StringToCudaArch(getProcessorFromTargetID(Triple, ArchStr));
  if (!SuppressError && Triple.isNVPTX() &&
      (Arch == CudaArch::UNKNOWN || !IsNVIDIAGpuArch(Arch))) {
    C.getDriver().Diag(clang::diag::err_drv_offload_bad_gpu_arch)
        << "CUDA" << ArchStr;
    return StringRef();
  } else if (!SuppressError && Triple.isAMDGPU() &&
             (Arch == CudaArch::UNKNOWN || !IsAMDGpuArch(Arch))) {
    C.getDriver().Diag(clang::diag::err_drv_offload_bad_gpu_arch)
        << "HIP" << ArchStr;
    return StringRef();
  }

  if (IsNVIDIAGpuArch(Arch))
    return Args.MakeArgStringRef(CudaArchToString(Arch));

  if (IsAMDGpuArch(Arch)) {
    llvm::StringMap<bool> Features;
    auto HIPTriple = getHIPOffloadTargetTriple(C.getDriver(), C.getInputArgs());
    if (!HIPTriple)
      return StringRef();
    auto Arch = parseTargetID(*HIPTriple, ArchStr, &Features);
    if (!Arch) {
      C.getDriver().Diag(clang::diag::err_drv_bad_target_id) << ArchStr;
      C.setContainsError();
      return StringRef();
    }
    return Args.MakeArgStringRef(getCanonicalTargetID(*Arch, Features));
  }

  // If the input isn't CUDA or HIP just return the architecture.
  return ArchStr;
}

/// Checks if the set of offloading architectures does not conflict. Returns
/// the incompatible pair if a conflict occurs.
static std::optional<std::pair<llvm::StringRef, llvm::StringRef>>
getConflictOffloadArchCombination(const llvm::DenseSet<StringRef> &Archs,
                                  llvm::Triple Triple) {
  if (!Triple.isAMDGPU())
    return std::nullopt;

  std::set<StringRef> ArchSet;
  llvm::copy(Archs, std::inserter(ArchSet, ArchSet.begin()));
  return getConflictTargetIDCombination(ArchSet);
}

llvm::DenseSet<StringRef>
Driver::getOffloadArchs(Compilation &C, const llvm::opt::DerivedArgList &Args,
                        Action::OffloadKind Kind, const ToolChain *TC,
                        bool SuppressError) const {
  if (!TC)
    TC = &C.getDefaultToolChain();

  // --offload and --offload-arch options are mutually exclusive.
  if (Args.hasArgNoClaim(options::OPT_offload_EQ) &&
      Args.hasArgNoClaim(options::OPT_offload_arch_EQ,
                         options::OPT_no_offload_arch_EQ)) {
    C.getDriver().Diag(diag::err_opt_not_valid_with_opt)
        << "--offload"
        << (Args.hasArgNoClaim(options::OPT_offload_arch_EQ)
                ? "--offload-arch"
                : "--no-offload-arch");
  }

  if (KnownArchs.contains(TC))
    return KnownArchs.lookup(TC);

  llvm::DenseSet<StringRef> Archs;
  for (auto *Arg : Args) {
    // Extract any '--[no-]offload-arch' arguments intended for this toolchain.
    std::unique_ptr<llvm::opt::Arg> ExtractedArg = nullptr;
    if (Arg->getOption().matches(options::OPT_Xopenmp_target_EQ) &&
        ToolChain::getOpenMPTriple(Arg->getValue(0)) == TC->getTriple()) {
      Arg->claim();
      unsigned Index = Args.getBaseArgs().MakeIndex(Arg->getValue(1));
      ExtractedArg = getOpts().ParseOneArg(Args, Index);
      Arg = ExtractedArg.get();
    }

    // Add or remove the seen architectures in order of appearance. If an
    // invalid architecture is given we simply exit.
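    // Illustrative example (not taken from the original source): given
    //   --offload-arch=gfx906,gfx908 --no-offload-arch=gfx906
    // the processing below leaves only gfx908 in the resulting set, since
    // arguments are applied in order of appearance.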
    if (Arg->getOption().matches(options::OPT_offload_arch_EQ)) {
      for (StringRef Arch : llvm::split(Arg->getValue(), ",")) {
        if (Arch == "native" || Arch.empty()) {
          auto GPUsOrErr = TC->getSystemGPUArchs(Args);
          if (!GPUsOrErr) {
            if (SuppressError)
              llvm::consumeError(GPUsOrErr.takeError());
            else
              TC->getDriver().Diag(diag::err_drv_undetermined_gpu_arch)
                  << llvm::Triple::getArchTypeName(TC->getArch())
                  << llvm::toString(GPUsOrErr.takeError()) << "--offload-arch";
            continue;
          }

          for (auto ArchStr : *GPUsOrErr) {
            Archs.insert(
                getCanonicalArchString(C, Args, Args.MakeArgString(ArchStr),
                                       TC->getTriple(), SuppressError));
          }
        } else {
          StringRef ArchStr = getCanonicalArchString(
              C, Args, Arch, TC->getTriple(), SuppressError);
          if (ArchStr.empty())
            return Archs;
          Archs.insert(ArchStr);
        }
      }
    } else if (Arg->getOption().matches(options::OPT_no_offload_arch_EQ)) {
      for (StringRef Arch : llvm::split(Arg->getValue(), ",")) {
        if (Arch == "all") {
          Archs.clear();
        } else {
          StringRef ArchStr = getCanonicalArchString(
              C, Args, Arch, TC->getTriple(), SuppressError);
          if (ArchStr.empty())
            return Archs;
          Archs.erase(ArchStr);
        }
      }
    }
  }

  if (auto ConflictingArchs =
          getConflictOffloadArchCombination(Archs, TC->getTriple())) {
    C.getDriver().Diag(clang::diag::err_drv_bad_offload_arch_combo)
        << ConflictingArchs->first << ConflictingArchs->second;
    C.setContainsError();
  }

  // Skip filling defaults if we're just querying what is available.
  if (SuppressError)
    return Archs;

  if (Archs.empty()) {
    if (Kind == Action::OFK_Cuda)
      Archs.insert(CudaArchToString(CudaArch::CudaDefault));
    else if (Kind == Action::OFK_HIP)
      Archs.insert(CudaArchToString(CudaArch::HIPDefault));
    else if (Kind == Action::OFK_OpenMP)
      Archs.insert(StringRef());
  } else {
    Args.ClaimAllArgs(options::OPT_offload_arch_EQ);
    Args.ClaimAllArgs(options::OPT_no_offload_arch_EQ);
  }

  return Archs;
}

Action *Driver::BuildOffloadingActions(Compilation &C,
                                       llvm::opt::DerivedArgList &Args,
                                       const InputTy &Input,
                                       Action *HostAction) const {
  // Don't build offloading actions if explicitly disabled or we do not have a
  // valid source input and compile action to embed it in. If preprocessing only
  // ignore embedding.
  if (offloadHostOnly() || !types::isSrcFile(Input.first) ||
      !(isa<CompileJobAction>(HostAction) ||
        getFinalPhase(Args) == phases::Preprocess))
    return HostAction;

  ActionList OffloadActions;
  OffloadAction::DeviceDependences DDeps;

  const Action::OffloadKind OffloadKinds[] = {
      Action::OFK_OpenMP, Action::OFK_Cuda, Action::OFK_HIP};

  for (Action::OffloadKind Kind : OffloadKinds) {
    SmallVector<const ToolChain *, 2> ToolChains;
    ActionList DeviceActions;

    auto TCRange = C.getOffloadToolChains(Kind);
    for (auto TI = TCRange.first, TE = TCRange.second; TI != TE; ++TI)
      ToolChains.push_back(TI->second);

    if (ToolChains.empty())
      continue;

    types::ID InputType = Input.first;
    const Arg *InputArg = Input.second;

    // The toolchain can be active for unsupported file types.
    if ((Kind == Action::OFK_Cuda && !types::isCuda(InputType)) ||
        (Kind == Action::OFK_HIP && !types::isHIP(InputType)))
      continue;

    // Get the product of all bound architectures and toolchains.
SmallVector> TCAndArchs; for (const ToolChain *TC : ToolChains) for (StringRef Arch : getOffloadArchs(C, Args, Kind, TC)) TCAndArchs.push_back(std::make_pair(TC, Arch)); for (unsigned I = 0, E = TCAndArchs.size(); I != E; ++I) DeviceActions.push_back(C.MakeAction(*InputArg, InputType)); if (DeviceActions.empty()) return HostAction; auto PL = types::getCompilationPhases(*this, Args, InputType); for (phases::ID Phase : PL) { if (Phase == phases::Link) { assert(Phase == PL.back() && "linking must be final compilation step."); break; } auto TCAndArch = TCAndArchs.begin(); for (Action *&A : DeviceActions) { if (A->getType() == types::TY_Nothing) continue; // Propagate the ToolChain so we can use it in ConstructPhaseAction. A->propagateDeviceOffloadInfo(Kind, TCAndArch->second.data(), TCAndArch->first); A = ConstructPhaseAction(C, Args, Phase, A, Kind); if (isa(A) && isa(HostAction) && Kind == Action::OFK_OpenMP && HostAction->getType() != types::TY_Nothing) { // OpenMP offloading has a dependency on the host compile action to // identify which declarations need to be emitted. This shouldn't be // collapsed with any other actions so we can use it in the device. HostAction->setCannotBeCollapsedWithNextDependentAction(); OffloadAction::HostDependence HDep( *HostAction, *C.getSingleOffloadToolChain(), TCAndArch->second.data(), Kind); OffloadAction::DeviceDependences DDep; DDep.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind); A = C.MakeAction(HDep, DDep); } ++TCAndArch; } } // Compiling HIP in non-RDC mode requires linking each action individually. for (Action *&A : DeviceActions) { if ((A->getType() != types::TY_Object && A->getType() != types::TY_LTO_BC) || Kind != Action::OFK_HIP || Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)) continue; ActionList LinkerInput = {A}; A = C.MakeAction(LinkerInput, types::TY_Image); } auto TCAndArch = TCAndArchs.begin(); for (Action *A : DeviceActions) { DDeps.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind); OffloadAction::DeviceDependences DDep; DDep.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind); OffloadActions.push_back(C.MakeAction(DDep, A->getType())); ++TCAndArch; } } if (offloadDeviceOnly()) return C.MakeAction(DDeps, types::TY_Nothing); if (OffloadActions.empty()) return HostAction; OffloadAction::DeviceDependences DDep; if (C.isOffloadingHostKind(Action::OFK_Cuda) && !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)) { // If we are not in RDC-mode we just emit the final CUDA fatbinary for // each translation unit without requiring any linking. Action *FatbinAction = C.MakeAction(OffloadActions, types::TY_CUDA_FATBIN); DDep.add(*FatbinAction, *C.getSingleOffloadToolChain(), nullptr, Action::OFK_Cuda); } else if (C.isOffloadingHostKind(Action::OFK_HIP) && !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)) { // If we are not in RDC-mode we just emit the final HIP fatbinary for each // translation unit, linking each input individually. Action *FatbinAction = C.MakeAction(OffloadActions, types::TY_HIP_FATBIN); DDep.add(*FatbinAction, *C.getSingleOffloadToolChain(), nullptr, Action::OFK_HIP); } else { // Package all the offloading actions into a single output that can be // embedded in the host and linked. 
Action *PackagerAction = C.MakeAction(OffloadActions, types::TY_Image); DDep.add(*PackagerAction, *C.getSingleOffloadToolChain(), nullptr, C.getActiveOffloadKinds()); } // If we are unable to embed a single device output into the host, we need to // add each device output as a host dependency to ensure they are still built. bool SingleDeviceOutput = !llvm::any_of(OffloadActions, [](Action *A) { return A->getType() == types::TY_Nothing; }) && isa(HostAction); OffloadAction::HostDependence HDep( *HostAction, *C.getSingleOffloadToolChain(), /*BoundArch=*/nullptr, SingleDeviceOutput ? DDep : DDeps); return C.MakeAction(HDep, SingleDeviceOutput ? DDep : DDeps); } Action *Driver::ConstructPhaseAction( Compilation &C, const ArgList &Args, phases::ID Phase, Action *Input, Action::OffloadKind TargetDeviceOffloadKind) const { llvm::PrettyStackTraceString CrashInfo("Constructing phase actions"); // Some types skip the assembler phase (e.g., llvm-bc), but we can't // encode this in the steps because the intermediate type depends on // arguments. Just special case here. if (Phase == phases::Assemble && Input->getType() != types::TY_PP_Asm) return Input; // Build the appropriate action. switch (Phase) { case phases::Link: llvm_unreachable("link action invalid here."); case phases::IfsMerge: llvm_unreachable("ifsmerge action invalid here."); case phases::Preprocess: { types::ID OutputTy; // -M and -MM specify the dependency file name by altering the output type, // -if -MD and -MMD are not specified. if (Args.hasArg(options::OPT_M, options::OPT_MM) && !Args.hasArg(options::OPT_MD, options::OPT_MMD)) { OutputTy = types::TY_Dependencies; } else { OutputTy = Input->getType(); // For these cases, the preprocessor is only translating forms, the Output // still needs preprocessing. if (!Args.hasFlag(options::OPT_frewrite_includes, options::OPT_fno_rewrite_includes, false) && !Args.hasFlag(options::OPT_frewrite_imports, options::OPT_fno_rewrite_imports, false) && !Args.hasFlag(options::OPT_fdirectives_only, options::OPT_fno_directives_only, false) && !CCGenDiagnostics) OutputTy = types::getPreprocessedType(OutputTy); assert(OutputTy != types::TY_INVALID && "Cannot preprocess this input type!"); } return C.MakeAction(Input, OutputTy); } case phases::Precompile: { // API extraction should not generate an actual precompilation action. if (Args.hasArg(options::OPT_extract_api)) return C.MakeAction(Input, types::TY_API_INFO); types::ID OutputTy = getPrecompiledType(Input->getType()); assert(OutputTy != types::TY_INVALID && "Cannot precompile this input type!"); // If we're given a module name, precompile header file inputs as a // module, not as a precompiled header. 
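    // Illustrative example (not part of the original source): with
    //   clang -fmodule-name=Foo -x c++-header Foo.h
    // the precompile action is built to emit a module file for Foo instead of
    // a PCH.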
const char *ModName = nullptr; if (OutputTy == types::TY_PCH) { if (Arg *A = Args.getLastArg(options::OPT_fmodule_name_EQ)) ModName = A->getValue(); if (ModName) OutputTy = types::TY_ModuleFile; } if (Args.hasArg(options::OPT_fsyntax_only)) { // Syntax checks should not emit a PCH file OutputTy = types::TY_Nothing; } return C.MakeAction(Input, OutputTy); } case phases::Compile: { if (Args.hasArg(options::OPT_fsyntax_only)) return C.MakeAction(Input, types::TY_Nothing); if (Args.hasArg(options::OPT_rewrite_objc)) return C.MakeAction(Input, types::TY_RewrittenObjC); if (Args.hasArg(options::OPT_rewrite_legacy_objc)) return C.MakeAction(Input, types::TY_RewrittenLegacyObjC); if (Args.hasArg(options::OPT__analyze)) return C.MakeAction(Input, types::TY_Plist); if (Args.hasArg(options::OPT__migrate)) return C.MakeAction(Input, types::TY_Remap); if (Args.hasArg(options::OPT_emit_ast)) return C.MakeAction(Input, types::TY_AST); if (Args.hasArg(options::OPT_module_file_info)) return C.MakeAction(Input, types::TY_ModuleFile); if (Args.hasArg(options::OPT_verify_pch)) return C.MakeAction(Input, types::TY_Nothing); if (Args.hasArg(options::OPT_extract_api)) return C.MakeAction(Input, types::TY_API_INFO); return C.MakeAction(Input, types::TY_LLVM_BC); } case phases::Backend: { if (isUsingLTO() && TargetDeviceOffloadKind == Action::OFK_None) { types::ID Output = Args.hasArg(options::OPT_S) ? types::TY_LTO_IR : types::TY_LTO_BC; return C.MakeAction(Input, Output); } if (isUsingLTO(/* IsOffload */ true) && TargetDeviceOffloadKind != Action::OFK_None) { types::ID Output = Args.hasArg(options::OPT_S) ? types::TY_LTO_IR : types::TY_LTO_BC; return C.MakeAction(Input, Output); } if (Args.hasArg(options::OPT_emit_llvm) || (((Input->getOffloadingToolChain() && Input->getOffloadingToolChain()->getTriple().isAMDGPU()) || TargetDeviceOffloadKind == Action::OFK_HIP) && (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) || TargetDeviceOffloadKind == Action::OFK_OpenMP))) { types::ID Output = Args.hasArg(options::OPT_S) && (TargetDeviceOffloadKind == Action::OFK_None || offloadDeviceOnly() || (TargetDeviceOffloadKind == Action::OFK_HIP && !Args.hasFlag(options::OPT_offload_new_driver, options::OPT_no_offload_new_driver, false))) ? types::TY_LLVM_IR : types::TY_LLVM_BC; return C.MakeAction(Input, Output); } return C.MakeAction(Input, types::TY_PP_Asm); } case phases::Assemble: return C.MakeAction(std::move(Input), types::TY_Object); } llvm_unreachable("invalid phase in ConstructPhaseAction"); } void Driver::BuildJobs(Compilation &C) const { llvm::PrettyStackTraceString CrashInfo("Building compilation jobs"); Arg *FinalOutput = C.getArgs().getLastArg(options::OPT_o); // It is an error to provide a -o option if we are making multiple output // files. There are exceptions: // // IfsMergeJob: when generating interface stubs enabled we want to be able to // generate the stub file at the same time that we generate the real // library/a.out. So when a .o, .so, etc are the output, with clang interface // stubs there will also be a .ifs and .ifso at the same location. // // CompileJob of type TY_IFS_CPP: when generating interface stubs is enabled // and -c is passed, we still want to be able to generate a .ifs file while // we are also generating .o files. So we allow more than one output file in // this case as well. // // OffloadClass of type TY_Nothing: device-only output will place many outputs // into a single offloading action. We should count all inputs to the action // as outputs. 
Also ignore device-only outputs if we're compiling with // -fsyntax-only. if (FinalOutput) { unsigned NumOutputs = 0; unsigned NumIfsOutputs = 0; for (const Action *A : C.getActions()) { if (A->getType() != types::TY_Nothing && A->getType() != types::TY_DX_CONTAINER && !(A->getKind() == Action::IfsMergeJobClass || (A->getType() == clang::driver::types::TY_IFS_CPP && A->getKind() == clang::driver::Action::CompileJobClass && 0 == NumIfsOutputs++) || (A->getKind() == Action::BindArchClass && A->getInputs().size() && A->getInputs().front()->getKind() == Action::IfsMergeJobClass))) ++NumOutputs; else if (A->getKind() == Action::OffloadClass && A->getType() == types::TY_Nothing && !C.getArgs().hasArg(options::OPT_fsyntax_only)) NumOutputs += A->size(); } if (NumOutputs > 1) { Diag(clang::diag::err_drv_output_argument_with_multiple_files); FinalOutput = nullptr; } } const llvm::Triple &RawTriple = C.getDefaultToolChain().getTriple(); // Collect the list of architectures. llvm::StringSet<> ArchNames; if (RawTriple.isOSBinFormatMachO()) for (const Arg *A : C.getArgs()) if (A->getOption().matches(options::OPT_arch)) ArchNames.insert(A->getValue()); // Set of (Action, canonical ToolChain triple) pairs we've built jobs for. std::map, InputInfoList> CachedResults; for (Action *A : C.getActions()) { // If we are linking an image for multiple archs then the linker wants // -arch_multiple and -final_output . Unfortunately, this // doesn't fit in cleanly because we have to pass this information down. // // FIXME: This is a hack; find a cleaner way to integrate this into the // process. const char *LinkingOutput = nullptr; if (isa(A)) { if (FinalOutput) LinkingOutput = FinalOutput->getValue(); else LinkingOutput = getDefaultImageName(); } BuildJobsForAction(C, A, &C.getDefaultToolChain(), /*BoundArch*/ StringRef(), /*AtTopLevel*/ true, /*MultipleArchs*/ ArchNames.size() > 1, /*LinkingOutput*/ LinkingOutput, CachedResults, /*TargetDeviceOffloadKind*/ Action::OFK_None); } // If we have more than one job, then disable integrated-cc1 for now. Do this // also when we need to report process execution statistics. if (C.getJobs().size() > 1 || CCPrintProcessStats) for (auto &J : C.getJobs()) J.InProcess = false; if (CCPrintProcessStats) { C.setPostCallback([=](const Command &Cmd, int Res) { std::optional ProcStat = Cmd.getProcessStatistics(); if (!ProcStat) return; const char *LinkingOutput = nullptr; if (FinalOutput) LinkingOutput = FinalOutput->getValue(); else if (!Cmd.getOutputFilenames().empty()) LinkingOutput = Cmd.getOutputFilenames().front().c_str(); else LinkingOutput = getDefaultImageName(); if (CCPrintStatReportFilename.empty()) { using namespace llvm; // Human readable output. outs() << sys::path::filename(Cmd.getExecutable()) << ": " << "output=" << LinkingOutput; outs() << ", total=" << format("%.3f", ProcStat->TotalTime.count() / 1000.) << " ms" << ", user=" << format("%.3f", ProcStat->UserTime.count() / 1000.) << " ms" << ", mem=" << ProcStat->PeakMemory << " Kb\n"; } else { // CSV format. 
        std::string Buffer;
        llvm::raw_string_ostream Out(Buffer);
        llvm::sys::printArg(Out,
                            llvm::sys::path::filename(Cmd.getExecutable()),
                            /*Quote*/ true);
        Out << ',';
        llvm::sys::printArg(Out, LinkingOutput, true);
        Out << ',' << ProcStat->TotalTime.count() << ','
            << ProcStat->UserTime.count() << ',' << ProcStat->PeakMemory
            << '\n';
        Out.flush();
        std::error_code EC;
        llvm::raw_fd_ostream OS(CCPrintStatReportFilename, EC,
                                llvm::sys::fs::OF_Append |
                                    llvm::sys::fs::OF_Text);
        if (EC)
          return;
        auto L = OS.lock();
        if (!L) {
          llvm::errs() << "ERROR: Cannot lock file "
                       << CCPrintStatReportFilename << ": "
                       << toString(L.takeError()) << "\n";
          return;
        }
        OS << Buffer;
        OS.flush();
      }
    });
  }

  // If the user passed -Qunused-arguments or there were errors, don't warn
  // about any unused arguments.
  if (Diags.hasErrorOccurred() ||
      C.getArgs().hasArg(options::OPT_Qunused_arguments))
    return;

  // Claim -fdriver-only here.
  (void)C.getArgs().hasArg(options::OPT_fdriver_only);

  // Claim -### here.
  (void)C.getArgs().hasArg(options::OPT__HASH_HASH_HASH);

  // Claim --driver-mode, --rsp-quoting, it was handled earlier.
  (void)C.getArgs().hasArg(options::OPT_driver_mode);
  (void)C.getArgs().hasArg(options::OPT_rsp_quoting);

+  bool HasAssembleJob = llvm::any_of(C.getJobs(), [](auto &J) {
+    // Match ClangAs and other derived assemblers of Tool. ClangAs uses a
+    // longer ShortName "clang integrated assembler" while other assemblers just
+    // use "assembler".
+    return strstr(J.getCreator().getShortName(), "assembler");
+  });
  for (Arg *A : C.getArgs()) {
    // FIXME: It would be nice to be able to send the argument to the
    // DiagnosticsEngine, so that extra values, position, and so on could be
    // printed.
    if (!A->isClaimed()) {
      if (A->getOption().hasFlag(options::NoArgumentUnused))
        continue;

      // Suppress the warning automatically if this is just a flag, and it is an
      // instance of an argument we already claimed.
      const Option &Opt = A->getOption();
      if (Opt.getKind() == Option::FlagClass) {
        bool DuplicateClaimed = false;

        for (const Arg *AA : C.getArgs().filtered(&Opt)) {
          if (AA->isClaimed()) {
            DuplicateClaimed = true;
            break;
          }
        }

        if (DuplicateClaimed)
          continue;
      }

      // In clang-cl, don't mention unknown arguments here since they have
      // already been warned about.
      if (!IsCLMode() || !A->getOption().matches(options::OPT_UNKNOWN)) {
        if (A->getOption().hasFlag(options::TargetSpecific) &&
-            !A->isIgnoredTargetSpecific()) {
+            !A->isIgnoredTargetSpecific() && !HasAssembleJob) {
          Diag(diag::err_drv_unsupported_opt_for_target)
              << A->getSpelling() << getTargetTriple();
        } else {
          Diag(clang::diag::warn_drv_unused_argument)
              << A->getAsString(C.getArgs());
        }
      }
    }
  }
}

namespace {
/// Utility class to control the collapse of dependent actions and select the
/// tools accordingly.
class ToolSelector final {
  /// The tool chain this selector refers to.
  const ToolChain &TC;

  /// The compilation this selector refers to.
  const Compilation &C;

  /// The base action this selector refers to.
  const JobAction *BaseAction;

  /// Set to true if the current toolchain refers to host actions.
  bool IsHostSelector;

  /// Set to true if save-temps and embed-bitcode functionalities are active.
  bool SaveTemps;
  bool EmbedBitcode;

  /// Get previous dependent action or null if that does not exist. If
  /// \a CanBeCollapsed is false, that action must be legal to collapse or
  /// null will be returned.
  const JobAction *getPrevDependentAction(const ActionList &Inputs,
                                          ActionList &SavedOffloadAction,
                                          bool CanBeCollapsed = true) {
    // An option can be collapsed only if it has a single input.
if (Inputs.size() != 1) return nullptr; Action *CurAction = *Inputs.begin(); if (CanBeCollapsed && !CurAction->isCollapsingWithNextDependentActionLegal()) return nullptr; // If the input action is an offload action. Look through it and save any // offload action that can be dropped in the event of a collapse. if (auto *OA = dyn_cast(CurAction)) { // If the dependent action is a device action, we will attempt to collapse // only with other device actions. Otherwise, we would do the same but // with host actions only. if (!IsHostSelector) { if (OA->hasSingleDeviceDependence(/*DoNotConsiderHostActions=*/true)) { CurAction = OA->getSingleDeviceDependence(/*DoNotConsiderHostActions=*/true); if (CanBeCollapsed && !CurAction->isCollapsingWithNextDependentActionLegal()) return nullptr; SavedOffloadAction.push_back(OA); return dyn_cast(CurAction); } } else if (OA->hasHostDependence()) { CurAction = OA->getHostDependence(); if (CanBeCollapsed && !CurAction->isCollapsingWithNextDependentActionLegal()) return nullptr; SavedOffloadAction.push_back(OA); return dyn_cast(CurAction); } return nullptr; } return dyn_cast(CurAction); } /// Return true if an assemble action can be collapsed. bool canCollapseAssembleAction() const { return TC.useIntegratedAs() && !SaveTemps && !C.getArgs().hasArg(options::OPT_via_file_asm) && !C.getArgs().hasArg(options::OPT__SLASH_FA) && !C.getArgs().hasArg(options::OPT__SLASH_Fa); } /// Return true if a preprocessor action can be collapsed. bool canCollapsePreprocessorAction() const { return !C.getArgs().hasArg(options::OPT_no_integrated_cpp) && !C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps && !C.getArgs().hasArg(options::OPT_rewrite_objc); } /// Struct that relates an action with the offload actions that would be /// collapsed with it. struct JobActionInfo final { /// The action this info refers to. const JobAction *JA = nullptr; /// The offload actions we need to take care off if this action is /// collapsed. ActionList SavedOffloadAction; }; /// Append collapsed offload actions from the give nnumber of elements in the /// action info array. static void AppendCollapsedOffloadAction(ActionList &CollapsedOffloadAction, ArrayRef &ActionInfo, unsigned ElementNum) { assert(ElementNum <= ActionInfo.size() && "Invalid number of elements."); for (unsigned I = 0; I < ElementNum; ++I) CollapsedOffloadAction.append(ActionInfo[I].SavedOffloadAction.begin(), ActionInfo[I].SavedOffloadAction.end()); } /// Functions that attempt to perform the combining. They detect if that is /// legal, and if so they update the inputs \a Inputs and the offload action /// that were collapsed in \a CollapsedOffloadAction. A tool that deals with /// the combined action is returned. If the combining is not legal or if the /// tool does not exist, null is returned. /// Currently three kinds of collapsing are supported: /// - Assemble + Backend + Compile; /// - Assemble + Backend ; /// - Backend + Compile. const Tool * combineAssembleBackendCompile(ArrayRef ActionInfo, ActionList &Inputs, ActionList &CollapsedOffloadAction) { if (ActionInfo.size() < 3 || !canCollapseAssembleAction()) return nullptr; auto *AJ = dyn_cast(ActionInfo[0].JA); auto *BJ = dyn_cast(ActionInfo[1].JA); auto *CJ = dyn_cast(ActionInfo[2].JA); if (!AJ || !BJ || !CJ) return nullptr; // Get compiler tool. const Tool *T = TC.SelectTool(*CJ); if (!T) return nullptr; // Can't collapse if we don't have codegen support unless we are // emitting LLVM IR. 
bool OutputIsLLVM = types::isLLVMIR(ActionInfo[0].JA->getType()); if (!T->hasIntegratedBackend() && !(OutputIsLLVM && T->canEmitIR())) return nullptr; // When using -fembed-bitcode, it is required to have the same tool (clang) // for both CompilerJA and BackendJA. Otherwise, combine two stages. if (EmbedBitcode) { const Tool *BT = TC.SelectTool(*BJ); if (BT == T) return nullptr; } if (!T->hasIntegratedAssembler()) return nullptr; Inputs = CJ->getInputs(); AppendCollapsedOffloadAction(CollapsedOffloadAction, ActionInfo, /*NumElements=*/3); return T; } const Tool *combineAssembleBackend(ArrayRef ActionInfo, ActionList &Inputs, ActionList &CollapsedOffloadAction) { if (ActionInfo.size() < 2 || !canCollapseAssembleAction()) return nullptr; auto *AJ = dyn_cast(ActionInfo[0].JA); auto *BJ = dyn_cast(ActionInfo[1].JA); if (!AJ || !BJ) return nullptr; // Get backend tool. const Tool *T = TC.SelectTool(*BJ); if (!T) return nullptr; if (!T->hasIntegratedAssembler()) return nullptr; Inputs = BJ->getInputs(); AppendCollapsedOffloadAction(CollapsedOffloadAction, ActionInfo, /*NumElements=*/2); return T; } const Tool *combineBackendCompile(ArrayRef ActionInfo, ActionList &Inputs, ActionList &CollapsedOffloadAction) { if (ActionInfo.size() < 2) return nullptr; auto *BJ = dyn_cast(ActionInfo[0].JA); auto *CJ = dyn_cast(ActionInfo[1].JA); if (!BJ || !CJ) return nullptr; // Check if the initial input (to the compile job or its predessor if one // exists) is LLVM bitcode. In that case, no preprocessor step is required // and we can still collapse the compile and backend jobs when we have // -save-temps. I.e. there is no need for a separate compile job just to // emit unoptimized bitcode. bool InputIsBitcode = true; for (size_t i = 1; i < ActionInfo.size(); i++) if (ActionInfo[i].JA->getType() != types::TY_LLVM_BC && ActionInfo[i].JA->getType() != types::TY_LTO_BC) { InputIsBitcode = false; break; } if (!InputIsBitcode && !canCollapsePreprocessorAction()) return nullptr; // Get compiler tool. const Tool *T = TC.SelectTool(*CJ); if (!T) return nullptr; // Can't collapse if we don't have codegen support unless we are // emitting LLVM IR. bool OutputIsLLVM = types::isLLVMIR(ActionInfo[0].JA->getType()); if (!T->hasIntegratedBackend() && !(OutputIsLLVM && T->canEmitIR())) return nullptr; if (T->canEmitIR() && ((SaveTemps && !InputIsBitcode) || EmbedBitcode)) return nullptr; Inputs = CJ->getInputs(); AppendCollapsedOffloadAction(CollapsedOffloadAction, ActionInfo, /*NumElements=*/2); return T; } /// Updates the inputs if the obtained tool supports combining with /// preprocessor action, and the current input is indeed a preprocessor /// action. If combining results in the collapse of offloading actions, those /// are appended to \a CollapsedOffloadAction. void combineWithPreprocessor(const Tool *T, ActionList &Inputs, ActionList &CollapsedOffloadAction) { if (!T || !canCollapsePreprocessorAction() || !T->hasIntegratedCPP()) return; // Attempt to get a preprocessor action dependence. ActionList PreprocessJobOffloadActions; ActionList NewInputs; for (Action *A : Inputs) { auto *PJ = getPrevDependentAction({A}, PreprocessJobOffloadActions); if (!PJ || !isa(PJ)) { NewInputs.push_back(A); continue; } // This is legal to combine. Append any offload action we found and add the // current input to preprocessor inputs. 
CollapsedOffloadAction.append(PreprocessJobOffloadActions.begin(), PreprocessJobOffloadActions.end()); NewInputs.append(PJ->input_begin(), PJ->input_end()); } Inputs = NewInputs; } public: ToolSelector(const JobAction *BaseAction, const ToolChain &TC, const Compilation &C, bool SaveTemps, bool EmbedBitcode) : TC(TC), C(C), BaseAction(BaseAction), SaveTemps(SaveTemps), EmbedBitcode(EmbedBitcode) { assert(BaseAction && "Invalid base action."); IsHostSelector = BaseAction->getOffloadingDeviceKind() == Action::OFK_None; } /// Check if a chain of actions can be combined and return the tool that can /// handle the combination of actions. The pointer to the current inputs \a /// Inputs and the list of offload actions \a CollapsedOffloadActions /// connected to collapsed actions are updated accordingly. The latter enables /// the caller of the selector to process them afterwards instead of just /// dropping them. If no suitable tool is found, null will be returned. const Tool *getTool(ActionList &Inputs, ActionList &CollapsedOffloadAction) { // // Get the largest chain of actions that we could combine. // SmallVector ActionChain(1); ActionChain.back().JA = BaseAction; while (ActionChain.back().JA) { const Action *CurAction = ActionChain.back().JA; // Grow the chain by one element. ActionChain.resize(ActionChain.size() + 1); JobActionInfo &AI = ActionChain.back(); // Attempt to fill it with the AI.JA = getPrevDependentAction(CurAction->getInputs(), AI.SavedOffloadAction); } // Pop the last action info as it could not be filled. ActionChain.pop_back(); // // Attempt to combine actions. If all combining attempts failed, just return // the tool of the provided action. At the end we attempt to combine the // action with any preprocessor action it may depend on. // const Tool *T = combineAssembleBackendCompile(ActionChain, Inputs, CollapsedOffloadAction); if (!T) T = combineAssembleBackend(ActionChain, Inputs, CollapsedOffloadAction); if (!T) T = combineBackendCompile(ActionChain, Inputs, CollapsedOffloadAction); if (!T) { Inputs = BaseAction->getInputs(); T = TC.SelectTool(*BaseAction); } combineWithPreprocessor(T, Inputs, CollapsedOffloadAction); return T; } }; } /// Return a string that uniquely identifies the result of a job. The bound arch /// is not necessarily represented in the toolchain's triple -- for example, /// armv7 and armv7s both map to the same triple -- so we need both in our map. /// Also, we need to add the offloading device kind, as the same tool chain can /// be used for host and device for some programming models, e.g. OpenMP. 
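/// Illustrative example (not part of the original source): a CUDA device job
/// bound to sm_70 on the nvptx64-nvidia-cuda toolchain maps to the key
/// "nvptx64-nvidia-cuda-sm_70-cuda".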
static std::string GetTriplePlusArchString(const ToolChain *TC, StringRef BoundArch, Action::OffloadKind OffloadKind) { std::string TriplePlusArch = TC->getTriple().normalize(); if (!BoundArch.empty()) { TriplePlusArch += "-"; TriplePlusArch += BoundArch; } TriplePlusArch += "-"; TriplePlusArch += Action::GetOffloadKindName(OffloadKind); return TriplePlusArch; } InputInfoList Driver::BuildJobsForAction( Compilation &C, const Action *A, const ToolChain *TC, StringRef BoundArch, bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput, std::map, InputInfoList> &CachedResults, Action::OffloadKind TargetDeviceOffloadKind) const { std::pair ActionTC = { A, GetTriplePlusArchString(TC, BoundArch, TargetDeviceOffloadKind)}; auto CachedResult = CachedResults.find(ActionTC); if (CachedResult != CachedResults.end()) { return CachedResult->second; } InputInfoList Result = BuildJobsForActionNoCache( C, A, TC, BoundArch, AtTopLevel, MultipleArchs, LinkingOutput, CachedResults, TargetDeviceOffloadKind); CachedResults[ActionTC] = Result; return Result; } static void handleTimeTrace(Compilation &C, const ArgList &Args, const JobAction *JA, const char *BaseInput, const InputInfo &Result) { Arg *A = Args.getLastArg(options::OPT_ftime_trace, options::OPT_ftime_trace_EQ); if (!A) return; SmallString<128> Path; if (A->getOption().matches(options::OPT_ftime_trace_EQ)) { Path = A->getValue(); if (llvm::sys::fs::is_directory(Path)) { SmallString<128> Tmp(Result.getFilename()); llvm::sys::path::replace_extension(Tmp, "json"); llvm::sys::path::append(Path, llvm::sys::path::filename(Tmp)); } } else { if (Arg *DumpDir = Args.getLastArgNoClaim(options::OPT_dumpdir)) { // The trace file is ${dumpdir}${basename}.json. Note that dumpdir may not // end with a path separator. Path = DumpDir->getValue(); Path += llvm::sys::path::filename(BaseInput); } else { Path = Result.getFilename(); } llvm::sys::path::replace_extension(Path, "json"); } const char *ResultFile = C.getArgs().MakeArgString(Path); C.addTimeTraceFile(ResultFile, JA); C.addResultFile(ResultFile, JA); } InputInfoList Driver::BuildJobsForActionNoCache( Compilation &C, const Action *A, const ToolChain *TC, StringRef BoundArch, bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput, std::map, InputInfoList> &CachedResults, Action::OffloadKind TargetDeviceOffloadKind) const { llvm::PrettyStackTraceString CrashInfo("Building compilation jobs"); InputInfoList OffloadDependencesInputInfo; bool BuildingForOffloadDevice = TargetDeviceOffloadKind != Action::OFK_None; if (const OffloadAction *OA = dyn_cast(A)) { // The 'Darwin' toolchain is initialized only when its arguments are // computed. Get the default arguments for OFK_None to ensure that // initialization is performed before processing the offload action. // FIXME: Remove when darwin's toolchain is initialized during construction. C.getArgsForToolChain(TC, BoundArch, Action::OFK_None); // The offload action is expected to be used in four different situations. // // a) Set a toolchain/architecture/kind for a host action: // Host Action 1 -> OffloadAction -> Host Action 2 // // b) Set a toolchain/architecture/kind for a device action; // Device Action 1 -> OffloadAction -> Device Action 2 // // c) Specify a device dependence to a host action; // Device Action 1 _ // \ // Host Action 1 ---> OffloadAction -> Host Action 2 // // d) Specify a host dependence to a device action. 
// Host Action 1 _ // \ // Device Action 1 ---> OffloadAction -> Device Action 2 // // For a) and b), we just return the job generated for the dependences. For // c) and d) we override the current action with the host/device dependence // if the current toolchain is host/device and set the offload dependences // info with the jobs obtained from the device/host dependence(s). // If there is a single device option or has no host action, just generate // the job for it. if (OA->hasSingleDeviceDependence() || !OA->hasHostDependence()) { InputInfoList DevA; OA->doOnEachDeviceDependence([&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { DevA.append(BuildJobsForAction(C, DepA, DepTC, DepBoundArch, AtTopLevel, /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, CachedResults, DepA->getOffloadingDeviceKind())); }); return DevA; } // If 'Action 2' is host, we generate jobs for the device dependences and // override the current action with the host dependence. Otherwise, we // generate the host dependences and override the action with the device // dependence. The dependences can't therefore be a top-level action. OA->doOnEachDependence( /*IsHostDependence=*/BuildingForOffloadDevice, [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { OffloadDependencesInputInfo.append(BuildJobsForAction( C, DepA, DepTC, DepBoundArch, /*AtTopLevel=*/false, /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, CachedResults, DepA->getOffloadingDeviceKind())); }); A = BuildingForOffloadDevice ? OA->getSingleDeviceDependence(/*DoNotConsiderHostActions=*/true) : OA->getHostDependence(); // We may have already built this action as a part of the offloading // toolchain, return the cached input if so. std::pair ActionTC = { OA->getHostDependence(), GetTriplePlusArchString(TC, BoundArch, TargetDeviceOffloadKind)}; if (CachedResults.find(ActionTC) != CachedResults.end()) { InputInfoList Inputs = CachedResults[ActionTC]; Inputs.append(OffloadDependencesInputInfo); return Inputs; } } if (const InputAction *IA = dyn_cast(A)) { // FIXME: It would be nice to not claim this here; maybe the old scheme of // just using Args was better? const Arg &Input = IA->getInputArg(); Input.claim(); if (Input.getOption().matches(options::OPT_INPUT)) { const char *Name = Input.getValue(); return {InputInfo(A, Name, /* _BaseInput = */ Name)}; } return {InputInfo(A, &Input, /* _BaseInput = */ "")}; } if (const BindArchAction *BAA = dyn_cast(A)) { const ToolChain *TC; StringRef ArchName = BAA->getArchName(); if (!ArchName.empty()) TC = &getToolChain(C.getArgs(), computeTargetTriple(*this, TargetTriple, C.getArgs(), ArchName)); else TC = &C.getDefaultToolChain(); return BuildJobsForAction(C, *BAA->input_begin(), TC, ArchName, AtTopLevel, MultipleArchs, LinkingOutput, CachedResults, TargetDeviceOffloadKind); } ActionList Inputs = A->getInputs(); const JobAction *JA = cast(A); ActionList CollapsedOffloadActions; ToolSelector TS(JA, *TC, C, isSaveTempsEnabled(), embedBitcodeInObject() && !isUsingLTO()); const Tool *T = TS.getTool(Inputs, CollapsedOffloadActions); if (!T) return {InputInfo()}; // If we've collapsed action list that contained OffloadAction we // need to build jobs for host/device-side inputs it may have held. 
for (const auto *OA : CollapsedOffloadActions) cast(OA)->doOnEachDependence( /*IsHostDependence=*/BuildingForOffloadDevice, [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { OffloadDependencesInputInfo.append(BuildJobsForAction( C, DepA, DepTC, DepBoundArch, /* AtTopLevel */ false, /*MultipleArchs=*/!!DepBoundArch, LinkingOutput, CachedResults, DepA->getOffloadingDeviceKind())); }); // Only use pipes when there is exactly one input. InputInfoList InputInfos; for (const Action *Input : Inputs) { // Treat dsymutil and verify sub-jobs as being at the top-level too, they // shouldn't get temporary output names. // FIXME: Clean this up. bool SubJobAtTopLevel = AtTopLevel && (isa(A) || isa(A)); InputInfos.append(BuildJobsForAction( C, Input, TC, BoundArch, SubJobAtTopLevel, MultipleArchs, LinkingOutput, CachedResults, A->getOffloadingDeviceKind())); } // Always use the first file input as the base input. const char *BaseInput = InputInfos[0].getBaseInput(); for (auto &Info : InputInfos) { if (Info.isFilename()) { BaseInput = Info.getBaseInput(); break; } } // ... except dsymutil actions, which use their actual input as the base // input. if (JA->getType() == types::TY_dSYM) BaseInput = InputInfos[0].getFilename(); // Append outputs of offload device jobs to the input list if (!OffloadDependencesInputInfo.empty()) InputInfos.append(OffloadDependencesInputInfo.begin(), OffloadDependencesInputInfo.end()); // Set the effective triple of the toolchain for the duration of this job. llvm::Triple EffectiveTriple; const ToolChain &ToolTC = T->getToolChain(); const ArgList &Args = C.getArgsForToolChain(TC, BoundArch, A->getOffloadingDeviceKind()); if (InputInfos.size() != 1) { EffectiveTriple = llvm::Triple(ToolTC.ComputeEffectiveClangTriple(Args)); } else { // Pass along the input type if it can be unambiguously determined. EffectiveTriple = llvm::Triple( ToolTC.ComputeEffectiveClangTriple(Args, InputInfos[0].getType())); } RegisterEffectiveTriple TripleRAII(ToolTC, EffectiveTriple); // Determine the place to write output to, if any. InputInfo Result; InputInfoList UnbundlingResults; if (auto *UA = dyn_cast(JA)) { // If we have an unbundling job, we need to create results for all the // outputs. We also update the results cache so that other actions using // this unbundling action can get the right results. for (auto &UI : UA->getDependentActionsInfo()) { assert(UI.DependentOffloadKind != Action::OFK_None && "Unbundling with no offloading??"); // Unbundling actions are never at the top level. When we generate the // offloading prefix, we also do that for the host file because the // unbundling action does not change the type of the output which can // cause a overwrite. std::string OffloadingPrefix = Action::GetOffloadingFileNamePrefix( UI.DependentOffloadKind, UI.DependentToolChain->getTriple().normalize(), /*CreatePrefixForHost=*/true); auto CurI = InputInfo( UA, GetNamedOutputPath(C, *UA, BaseInput, UI.DependentBoundArch, /*AtTopLevel=*/false, MultipleArchs || UI.DependentOffloadKind == Action::OFK_HIP, OffloadingPrefix), BaseInput); // Save the unbundling result. UnbundlingResults.push_back(CurI); // Get the unique string identifier for this dependence and cache the // result. 
StringRef Arch; if (TargetDeviceOffloadKind == Action::OFK_HIP) { if (UI.DependentOffloadKind == Action::OFK_Host) Arch = StringRef(); else Arch = UI.DependentBoundArch; } else Arch = BoundArch; CachedResults[{A, GetTriplePlusArchString(UI.DependentToolChain, Arch, UI.DependentOffloadKind)}] = { CurI}; } // Now that we have all the results generated, select the one that should be // returned for the current depending action. std::pair ActionTC = { A, GetTriplePlusArchString(TC, BoundArch, TargetDeviceOffloadKind)}; assert(CachedResults.find(ActionTC) != CachedResults.end() && "Result does not exist??"); Result = CachedResults[ActionTC].front(); } else if (JA->getType() == types::TY_Nothing) Result = {InputInfo(A, BaseInput)}; else { // We only have to generate a prefix for the host if this is not a top-level // action. std::string OffloadingPrefix = Action::GetOffloadingFileNamePrefix( A->getOffloadingDeviceKind(), TC->getTriple().normalize(), /*CreatePrefixForHost=*/isa(A) || !(A->getOffloadingHostActiveKinds() == Action::OFK_None || AtTopLevel)); Result = InputInfo(A, GetNamedOutputPath(C, *JA, BaseInput, BoundArch, AtTopLevel, MultipleArchs, OffloadingPrefix), BaseInput); if (T->canEmitIR() && OffloadingPrefix.empty()) handleTimeTrace(C, Args, JA, BaseInput, Result); } if (CCCPrintBindings && !CCGenDiagnostics) { llvm::errs() << "# \"" << T->getToolChain().getTripleString() << '"' << " - \"" << T->getName() << "\", inputs: ["; for (unsigned i = 0, e = InputInfos.size(); i != e; ++i) { llvm::errs() << InputInfos[i].getAsString(); if (i + 1 != e) llvm::errs() << ", "; } if (UnbundlingResults.empty()) llvm::errs() << "], output: " << Result.getAsString() << "\n"; else { llvm::errs() << "], outputs: ["; for (unsigned i = 0, e = UnbundlingResults.size(); i != e; ++i) { llvm::errs() << UnbundlingResults[i].getAsString(); if (i + 1 != e) llvm::errs() << ", "; } llvm::errs() << "] \n"; } } else { if (UnbundlingResults.empty()) T->ConstructJob( C, *JA, Result, InputInfos, C.getArgsForToolChain(TC, BoundArch, JA->getOffloadingDeviceKind()), LinkingOutput); else T->ConstructJobMultipleOutputs( C, *JA, UnbundlingResults, InputInfos, C.getArgsForToolChain(TC, BoundArch, JA->getOffloadingDeviceKind()), LinkingOutput); } return {Result}; } const char *Driver::getDefaultImageName() const { llvm::Triple Target(llvm::Triple::normalize(TargetTriple)); return Target.isOSWindows() ? "a.exe" : "a.out"; } /// Create output filename based on ArgValue, which could either be a /// full filename, filename without extension, or a directory. If ArgValue /// does not provide a filename, then use BaseName, and use the extension /// suitable for FileType. static const char *MakeCLOutputFilename(const ArgList &Args, StringRef ArgValue, StringRef BaseName, types::ID FileType) { SmallString<128> Filename = ArgValue; if (ArgValue.empty()) { // If the argument is empty, output to BaseName in the current dir. Filename = BaseName; } else if (llvm::sys::path::is_separator(Filename.back())) { // If the argument is a directory, output to BaseName in that dir. llvm::sys::path::append(Filename, BaseName); } if (!llvm::sys::path::has_extension(ArgValue)) { // If the argument didn't provide an extension, then set it. const char *Extension = types::getTypeTempSuffix(FileType, true); if (FileType == types::TY_Image && Args.hasArg(options::OPT__SLASH_LD, options::OPT__SLASH_LDd)) { // The output file is a dll. 
Extension = "dll"; } llvm::sys::path::replace_extension(Filename, Extension); } return Args.MakeArgString(Filename.c_str()); } static bool HasPreprocessOutput(const Action &JA) { if (isa(JA)) return true; if (isa(JA) && isa(JA.getInputs()[0])) return true; if (isa(JA) && HasPreprocessOutput(*(JA.getInputs()[0]))) return true; return false; } const char *Driver::CreateTempFile(Compilation &C, StringRef Prefix, StringRef Suffix, bool MultipleArchs, StringRef BoundArch, bool NeedUniqueDirectory) const { SmallString<128> TmpName; Arg *A = C.getArgs().getLastArg(options::OPT_fcrash_diagnostics_dir); std::optional CrashDirectory = CCGenDiagnostics && A ? std::string(A->getValue()) : llvm::sys::Process::GetEnv("CLANG_CRASH_DIAGNOSTICS_DIR"); if (CrashDirectory) { if (!getVFS().exists(*CrashDirectory)) llvm::sys::fs::create_directories(*CrashDirectory); SmallString<128> Path(*CrashDirectory); llvm::sys::path::append(Path, Prefix); const char *Middle = !Suffix.empty() ? "-%%%%%%." : "-%%%%%%"; if (std::error_code EC = llvm::sys::fs::createUniqueFile(Path + Middle + Suffix, TmpName)) { Diag(clang::diag::err_unable_to_make_temp) << EC.message(); return ""; } } else { if (MultipleArchs && !BoundArch.empty()) { if (NeedUniqueDirectory) { TmpName = GetTemporaryDirectory(Prefix); llvm::sys::path::append(TmpName, Twine(Prefix) + "-" + BoundArch + "." + Suffix); } else { TmpName = GetTemporaryPath((Twine(Prefix) + "-" + BoundArch).str(), Suffix); } } else { TmpName = GetTemporaryPath(Prefix, Suffix); } } return C.addTempFile(C.getArgs().MakeArgString(TmpName)); } // Calculate the output path of the module file when compiling a module unit // with the `-fmodule-output` option or `-fmodule-output=` option specified. // The behavior is: // - If `-fmodule-output=` is specfied, then the module file is // writing to the value. // - Otherwise if the output object file of the module unit is specified, the // output path // of the module file should be the same with the output object file except // the corresponding suffix. This requires both `-o` and `-c` are specified. // - Otherwise, the output path of the module file will be the same with the // input with the corresponding suffix. static const char *GetModuleOutputPath(Compilation &C, const JobAction &JA, const char *BaseInput) { assert(isa(JA) && JA.getType() == types::TY_ModuleFile && (C.getArgs().hasArg(options::OPT_fmodule_output) || C.getArgs().hasArg(options::OPT_fmodule_output_EQ))); if (Arg *ModuleOutputEQ = C.getArgs().getLastArg(options::OPT_fmodule_output_EQ)) return C.addResultFile(ModuleOutputEQ->getValue(), &JA); SmallString<64> OutputPath; Arg *FinalOutput = C.getArgs().getLastArg(options::OPT_o); if (FinalOutput && C.getArgs().hasArg(options::OPT_c)) OutputPath = FinalOutput->getValue(); else OutputPath = BaseInput; const char *Extension = types::getTypeTempSuffix(JA.getType()); llvm::sys::path::replace_extension(OutputPath, Extension); return C.addResultFile(C.getArgs().MakeArgString(OutputPath.c_str()), &JA); } const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA, const char *BaseInput, StringRef OrigBoundArch, bool AtTopLevel, bool MultipleArchs, StringRef OffloadingPrefix) const { std::string BoundArch = OrigBoundArch.str(); if (is_style_windows(llvm::sys::path::Style::native)) { // BoundArch may contains ':', which is invalid in file names on Windows, // therefore replace it with '%'. 
std::replace(BoundArch.begin(), BoundArch.end(), ':', '@'); } llvm::PrettyStackTraceString CrashInfo("Computing output path"); // Output to a user requested destination? if (AtTopLevel && !isa(JA) && !isa(JA)) { if (Arg *FinalOutput = C.getArgs().getLastArg(options::OPT_o)) return C.addResultFile(FinalOutput->getValue(), &JA); } // For /P, preprocess to file named after BaseInput. if (C.getArgs().hasArg(options::OPT__SLASH_P)) { assert(AtTopLevel && isa(JA)); StringRef BaseName = llvm::sys::path::filename(BaseInput); StringRef NameArg; if (Arg *A = C.getArgs().getLastArg(options::OPT__SLASH_Fi)) NameArg = A->getValue(); return C.addResultFile( MakeCLOutputFilename(C.getArgs(), NameArg, BaseName, types::TY_PP_C), &JA); } // Default to writing to stdout? if (AtTopLevel && !CCGenDiagnostics && HasPreprocessOutput(JA)) { return "-"; } if (JA.getType() == types::TY_ModuleFile && C.getArgs().getLastArg(options::OPT_module_file_info)) { return "-"; } if (IsDXCMode() && !C.getArgs().hasArg(options::OPT_o)) return "-"; // Is this the assembly listing for /FA? if (JA.getType() == types::TY_PP_Asm && (C.getArgs().hasArg(options::OPT__SLASH_FA) || C.getArgs().hasArg(options::OPT__SLASH_Fa))) { // Use /Fa and the input filename to determine the asm file name. StringRef BaseName = llvm::sys::path::filename(BaseInput); StringRef FaValue = C.getArgs().getLastArgValue(options::OPT__SLASH_Fa); return C.addResultFile( MakeCLOutputFilename(C.getArgs(), FaValue, BaseName, JA.getType()), &JA); } bool SpecifiedModuleOutput = C.getArgs().hasArg(options::OPT_fmodule_output) || C.getArgs().hasArg(options::OPT_fmodule_output_EQ); if (MultipleArchs && SpecifiedModuleOutput) Diag(clang::diag::err_drv_module_output_with_multiple_arch); // If we're emitting a module output with the specified option // `-fmodule-output`. if (!AtTopLevel && isa(JA) && JA.getType() == types::TY_ModuleFile && SpecifiedModuleOutput) return GetModuleOutputPath(C, JA, BaseInput); // Output to a temporary file? if ((!AtTopLevel && !isSaveTempsEnabled() && !C.getArgs().hasArg(options::OPT__SLASH_Fo)) || CCGenDiagnostics) { StringRef Name = llvm::sys::path::filename(BaseInput); std::pair Split = Name.split('.'); const char *Suffix = types::getTypeTempSuffix(JA.getType(), IsCLMode()); // The non-offloading toolchain on Darwin requires deterministic input // file name for binaries to be deterministic, therefore it needs unique // directory. llvm::Triple Triple(C.getDriver().getTargetTriple()); bool NeedUniqueDirectory = (JA.getOffloadingDeviceKind() == Action::OFK_None || JA.getOffloadingDeviceKind() == Action::OFK_Host) && Triple.isOSDarwin(); return CreateTempFile(C, Split.first, Suffix, MultipleArchs, BoundArch, NeedUniqueDirectory); } SmallString<128> BasePath(BaseInput); SmallString<128> ExternalPath(""); StringRef BaseName; // Dsymutil actions should use the full path. if (isa(JA) && C.getArgs().hasArg(options::OPT_dsym_dir)) { ExternalPath += C.getArgs().getLastArg(options::OPT_dsym_dir)->getValue(); // We use posix style here because the tests (specifically // darwin-dsymutil.c) demonstrate that posix style paths are acceptable // even on Windows and if we don't then the similar test covering this // fails. llvm::sys::path::append(ExternalPath, llvm::sys::path::Style::posix, llvm::sys::path::filename(BasePath)); BaseName = ExternalPath; } else if (isa(JA) || isa(JA)) BaseName = BasePath; else BaseName = llvm::sys::path::filename(BasePath); // Determine what the derived output name should be. 
const char *NamedOutput; if ((JA.getType() == types::TY_Object || JA.getType() == types::TY_LTO_BC) && C.getArgs().hasArg(options::OPT__SLASH_Fo, options::OPT__SLASH_o)) { // The /Fo or /o flag decides the object filename. StringRef Val = C.getArgs() .getLastArg(options::OPT__SLASH_Fo, options::OPT__SLASH_o) ->getValue(); NamedOutput = MakeCLOutputFilename(C.getArgs(), Val, BaseName, types::TY_Object); } else if (JA.getType() == types::TY_Image && C.getArgs().hasArg(options::OPT__SLASH_Fe, options::OPT__SLASH_o)) { // The /Fe or /o flag names the linked file. StringRef Val = C.getArgs() .getLastArg(options::OPT__SLASH_Fe, options::OPT__SLASH_o) ->getValue(); NamedOutput = MakeCLOutputFilename(C.getArgs(), Val, BaseName, types::TY_Image); } else if (JA.getType() == types::TY_Image) { if (IsCLMode()) { // clang-cl uses BaseName for the executable name. NamedOutput = MakeCLOutputFilename(C.getArgs(), "", BaseName, types::TY_Image); } else { SmallString<128> Output(getDefaultImageName()); // HIP image for device compilation with -fno-gpu-rdc is per compilation // unit. bool IsHIPNoRDC = JA.getOffloadingDeviceKind() == Action::OFK_HIP && !C.getArgs().hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false); bool UseOutExtension = IsHIPNoRDC || isa(JA); if (UseOutExtension) { Output = BaseName; llvm::sys::path::replace_extension(Output, ""); } Output += OffloadingPrefix; if (MultipleArchs && !BoundArch.empty()) { Output += "-"; Output.append(BoundArch); } if (UseOutExtension) Output += ".out"; NamedOutput = C.getArgs().MakeArgString(Output.c_str()); } } else if (JA.getType() == types::TY_PCH && IsCLMode()) { NamedOutput = C.getArgs().MakeArgString(GetClPchPath(C, BaseName)); } else if ((JA.getType() == types::TY_Plist || JA.getType() == types::TY_AST) && C.getArgs().hasArg(options::OPT__SLASH_o)) { StringRef Val = C.getArgs() .getLastArg(options::OPT__SLASH_o) ->getValue(); NamedOutput = MakeCLOutputFilename(C.getArgs(), Val, BaseName, types::TY_Object); } else { const char *Suffix = types::getTypeTempSuffix(JA.getType(), IsCLMode()); assert(Suffix && "All types used for output should have a suffix."); std::string::size_type End = std::string::npos; if (!types::appendSuffixForType(JA.getType())) End = BaseName.rfind('.'); SmallString<128> Suffixed(BaseName.substr(0, End)); Suffixed += OffloadingPrefix; if (MultipleArchs && !BoundArch.empty()) { Suffixed += "-"; Suffixed.append(BoundArch); } // When using both -save-temps and -emit-llvm, use a ".tmp.bc" suffix for // the unoptimized bitcode so that it does not get overwritten by the ".bc" // optimized bitcode output. auto IsAMDRDCInCompilePhase = [](const JobAction &JA, const llvm::opt::DerivedArgList &Args) { // The relocatable compilation in HIP and OpenMP implies -emit-llvm. // Similarly, use a ".tmp.bc" suffix for the unoptimized bitcode // (generated in the compile phase.) 
const ToolChain *TC = JA.getOffloadingToolChain(); return isa(JA) && ((JA.getOffloadingDeviceKind() == Action::OFK_HIP && Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)) || (JA.getOffloadingDeviceKind() == Action::OFK_OpenMP && TC && TC->getTriple().isAMDGPU())); }; if (!AtTopLevel && JA.getType() == types::TY_LLVM_BC && (C.getArgs().hasArg(options::OPT_emit_llvm) || IsAMDRDCInCompilePhase(JA, C.getArgs()))) Suffixed += ".tmp"; Suffixed += '.'; Suffixed += Suffix; NamedOutput = C.getArgs().MakeArgString(Suffixed.c_str()); } // Prepend object file path if -save-temps=obj if (!AtTopLevel && isSaveTempsObj() && C.getArgs().hasArg(options::OPT_o) && JA.getType() != types::TY_PCH) { Arg *FinalOutput = C.getArgs().getLastArg(options::OPT_o); SmallString<128> TempPath(FinalOutput->getValue()); llvm::sys::path::remove_filename(TempPath); StringRef OutputFileName = llvm::sys::path::filename(NamedOutput); llvm::sys::path::append(TempPath, OutputFileName); NamedOutput = C.getArgs().MakeArgString(TempPath.c_str()); } // If we're saving temps and the temp file conflicts with the input file, // then avoid overwriting input file. if (!AtTopLevel && isSaveTempsEnabled() && NamedOutput == BaseName) { bool SameFile = false; SmallString<256> Result; llvm::sys::fs::current_path(Result); llvm::sys::path::append(Result, BaseName); llvm::sys::fs::equivalent(BaseInput, Result.c_str(), SameFile); // Must share the same path to conflict. if (SameFile) { StringRef Name = llvm::sys::path::filename(BaseInput); std::pair Split = Name.split('.'); std::string TmpName = GetTemporaryPath( Split.first, types::getTypeTempSuffix(JA.getType(), IsCLMode())); return C.addTempFile(C.getArgs().MakeArgString(TmpName)); } } // As an annoying special case, PCH generation doesn't strip the pathname. if (JA.getType() == types::TY_PCH && !IsCLMode()) { llvm::sys::path::remove_filename(BasePath); if (BasePath.empty()) BasePath = NamedOutput; else llvm::sys::path::append(BasePath, NamedOutput); return C.addResultFile(C.getArgs().MakeArgString(BasePath.c_str()), &JA); } return C.addResultFile(NamedOutput, &JA); } std::string Driver::GetFilePath(StringRef Name, const ToolChain &TC) const { // Search for Name in a list of paths. auto SearchPaths = [&](const llvm::SmallVectorImpl &P) -> std::optional { // Respect a limited subset of the '-Bprefix' functionality in GCC by // attempting to use this prefix when looking for file paths. for (const auto &Dir : P) { if (Dir.empty()) continue; SmallString<128> P(Dir[0] == '=' ? 
SysRoot + Dir.substr(1) : Dir); llvm::sys::path::append(P, Name); if (llvm::sys::fs::exists(Twine(P))) return std::string(P); } return std::nullopt; }; if (auto P = SearchPaths(PrefixDirs)) return *P; SmallString<128> R(ResourceDir); llvm::sys::path::append(R, Name); if (llvm::sys::fs::exists(Twine(R))) return std::string(R.str()); SmallString<128> P(TC.getCompilerRTPath()); llvm::sys::path::append(P, Name); if (llvm::sys::fs::exists(Twine(P))) return std::string(P.str()); SmallString<128> D(Dir); llvm::sys::path::append(D, "..", Name); if (llvm::sys::fs::exists(Twine(D))) return std::string(D.str()); if (auto P = SearchPaths(TC.getLibraryPaths())) return *P; if (auto P = SearchPaths(TC.getFilePaths())) return *P; return std::string(Name); } void Driver::generatePrefixedToolNames( StringRef Tool, const ToolChain &TC, SmallVectorImpl &Names) const { // FIXME: Needs a better variable than TargetTriple Names.emplace_back((TargetTriple + "-" + Tool).str()); Names.emplace_back(Tool); } static bool ScanDirForExecutable(SmallString<128> &Dir, StringRef Name) { llvm::sys::path::append(Dir, Name); if (llvm::sys::fs::can_execute(Twine(Dir))) return true; llvm::sys::path::remove_filename(Dir); return false; } std::string Driver::GetProgramPath(StringRef Name, const ToolChain &TC) const { SmallVector TargetSpecificExecutables; generatePrefixedToolNames(Name, TC, TargetSpecificExecutables); // Respect a limited subset of the '-Bprefix' functionality in GCC by // attempting to use this prefix when looking for program paths. for (const auto &PrefixDir : PrefixDirs) { if (llvm::sys::fs::is_directory(PrefixDir)) { SmallString<128> P(PrefixDir); if (ScanDirForExecutable(P, Name)) return std::string(P.str()); } else { SmallString<128> P((PrefixDir + Name).str()); if (llvm::sys::fs::can_execute(Twine(P))) return std::string(P.str()); } } const ToolChain::path_list &List = TC.getProgramPaths(); for (const auto &TargetSpecificExecutable : TargetSpecificExecutables) { // For each possible name of the tool look for it in // program paths first, then the path. // Higher priority names will be first, meaning that // a higher priority name in the path will be found // instead of a lower priority name in the program path. // E.g. -gcc on the path will be found instead // of gcc in the program path for (const auto &Path : List) { SmallString<128> P(Path); if (ScanDirForExecutable(P, TargetSpecificExecutable)) return std::string(P.str()); } // Fall back to the path if (llvm::ErrorOr P = llvm::sys::findProgramByName(TargetSpecificExecutable)) return *P; } return std::string(Name); } std::string Driver::GetTemporaryPath(StringRef Prefix, StringRef Suffix) const { SmallString<128> Path; std::error_code EC = llvm::sys::fs::createTemporaryFile(Prefix, Suffix, Path); if (EC) { Diag(clang::diag::err_unable_to_make_temp) << EC.message(); return ""; } return std::string(Path.str()); } std::string Driver::GetTemporaryDirectory(StringRef Prefix) const { SmallString<128> Path; std::error_code EC = llvm::sys::fs::createUniqueDirectory(Prefix, Path); if (EC) { Diag(clang::diag::err_unable_to_make_temp) << EC.message(); return ""; } return std::string(Path.str()); } std::string Driver::GetClPchPath(Compilation &C, StringRef BaseName) const { SmallString<128> Output; if (Arg *FpArg = C.getArgs().getLastArg(options::OPT__SLASH_Fp)) { // FIXME: If anybody needs it, implement this obscure rule: // "If you specify a directory without a file name, the default file name // is VCx0.pch., where x is the major version of Visual C++ in use." 
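// --- illustrative sketch (not part of the patch) ---------------------------
// A minimal, standalone rendition of the lookup pattern used by
// Driver::GetFilePath / Driver::GetProgramPath above: walk a prioritized
// list of candidate directories, return the first entry that actually
// exists, and fall back to the bare name so that later lookup (or an error)
// can take over. Function and variable names here are invented for
// illustration, and only the standard library is used instead of the LLVM
// support APIs.
#include <filesystem>
#include <string>
#include <vector>

static std::string findFileInDirs(const std::vector<std::string> &Dirs,
                                  const std::string &Name) {
  namespace fs = std::filesystem;
  for (const std::string &Dir : Dirs) {
    if (Dir.empty())
      continue; // Skip empty entries, as the driver does.
    fs::path Candidate = fs::path(Dir) / Name;
    if (fs::exists(Candidate))
      return Candidate.string(); // First hit in priority order wins.
  }
  return Name; // Nothing found: hand back the bare name unchanged.
}
// ----------------------------------------------------------------------------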
Output = FpArg->getValue(); // "If you do not specify an extension as part of the path name, an // extension of .pch is assumed. " if (!llvm::sys::path::has_extension(Output)) Output += ".pch"; } else { if (Arg *YcArg = C.getArgs().getLastArg(options::OPT__SLASH_Yc)) Output = YcArg->getValue(); if (Output.empty()) Output = BaseName; llvm::sys::path::replace_extension(Output, ".pch"); } return std::string(Output.str()); } const ToolChain &Driver::getToolChain(const ArgList &Args, const llvm::Triple &Target) const { auto &TC = ToolChains[Target.str()]; if (!TC) { switch (Target.getOS()) { case llvm::Triple::AIX: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::Haiku: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::Ananas: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::CloudABI: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::Darwin: case llvm::Triple::MacOSX: case llvm::Triple::IOS: case llvm::Triple::TvOS: case llvm::Triple::WatchOS: case llvm::Triple::DriverKit: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::DragonFly: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::OpenBSD: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::NetBSD: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::FreeBSD: if (Target.isPPC()) TC = std::make_unique(*this, Target, Args); else TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::Minix: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::Linux: case llvm::Triple::ELFIAMCU: if (Target.getArch() == llvm::Triple::hexagon) TC = std::make_unique(*this, Target, Args); else if ((Target.getVendor() == llvm::Triple::MipsTechnologies) && !Target.hasEnvironment()) TC = std::make_unique(*this, Target, Args); else if (Target.isPPC()) TC = std::make_unique(*this, Target, Args); else if (Target.getArch() == llvm::Triple::ve) TC = std::make_unique(*this, Target, Args); else if (Target.isOHOSFamily()) TC = std::make_unique(*this, Target, Args); else TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::NaCl: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::Fuchsia: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::Solaris: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::CUDA: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::AMDHSA: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::AMDPAL: case llvm::Triple::Mesa3D: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::Win32: switch (Target.getEnvironment()) { default: if (Target.isOSBinFormatELF()) TC = std::make_unique(*this, Target, Args); else if (Target.isOSBinFormatMachO()) TC = std::make_unique(*this, Target, Args); else TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::GNU: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::Itanium: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::MSVC: case llvm::Triple::UnknownEnvironment: if (Args.getLastArgValue(options::OPT_fuse_ld_EQ) .starts_with_insensitive("bfd")) TC = std::make_unique( *this, Target, Args); else TC = std::make_unique(*this, Target, Args); break; } break; case llvm::Triple::PS4: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::PS5: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::Contiki: TC = std::make_unique(*this, 
Target, Args); break; case llvm::Triple::Hurd: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::LiteOS: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::ZOS: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::ShaderModel: TC = std::make_unique(*this, Target, Args); break; default: // Of these targets, Hexagon is the only one that might have // an OS of Linux, in which case it got handled above already. switch (Target.getArch()) { case llvm::Triple::tce: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::tcele: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::hexagon: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::lanai: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::xcore: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::wasm32: case llvm::Triple::wasm64: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::avr: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::msp430: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::riscv32: case llvm::Triple::riscv64: if (toolchains::RISCVToolChain::hasGCCToolchain(*this, Args)) TC = std::make_unique(*this, Target, Args); else TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::ve: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::spirv32: case llvm::Triple::spirv64: TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::csky: TC = std::make_unique(*this, Target, Args); break; default: if (Target.getVendor() == llvm::Triple::Myriad) TC = std::make_unique(*this, Target, Args); else if (toolchains::BareMetal::handlesTarget(Target)) TC = std::make_unique(*this, Target, Args); else if (Target.isOSBinFormatELF()) TC = std::make_unique(*this, Target, Args); else if (Target.isOSBinFormatMachO()) TC = std::make_unique(*this, Target, Args); else TC = std::make_unique(*this, Target, Args); } } } return *TC; } const ToolChain &Driver::getOffloadingDeviceToolChain( const ArgList &Args, const llvm::Triple &Target, const ToolChain &HostTC, const Action::OffloadKind &TargetDeviceOffloadKind) const { // Use device / host triples as the key into the ToolChains map because the // device ToolChain we create depends on both. auto &TC = ToolChains[Target.str() + "/" + HostTC.getTriple().str()]; if (!TC) { // Categorized by offload kind > arch rather than OS > arch like // the normal getToolChain call, as it seems a reasonable way to categorize // things. switch (TargetDeviceOffloadKind) { case Action::OFK_HIP: { if (Target.getArch() == llvm::Triple::amdgcn && Target.getVendor() == llvm::Triple::AMD && Target.getOS() == llvm::Triple::AMDHSA) TC = std::make_unique(*this, Target, HostTC, Args); else if (Target.getArch() == llvm::Triple::spirv64 && Target.getVendor() == llvm::Triple::UnknownVendor && Target.getOS() == llvm::Triple::UnknownOS) TC = std::make_unique(*this, Target, HostTC, Args); break; } default: break; } } return *TC; } bool Driver::ShouldUseClangCompiler(const JobAction &JA) const { // Say "no" if there is not exactly one input of a type clang understands. if (JA.size() != 1 || !types::isAcceptedByClang((*JA.input_begin())->getType())) return false; // And say "no" if this is not a kind of action clang understands. 
  if (!isa<PreprocessJobAction>(JA) && !isa<PrecompileJobAction>(JA) &&
      !isa<CompileJobAction>(JA) && !isa<BackendJobAction>(JA) &&
      !isa<ExtractAPIJobAction>(JA))
    return false;

  return true;
}

bool Driver::ShouldUseFlangCompiler(const JobAction &JA) const {
  // Say "no" if there is not exactly one input of a type flang understands.
  if (JA.size() != 1 ||
      !types::isAcceptedByFlang((*JA.input_begin())->getType()))
    return false;

  // And say "no" if this is not a kind of action flang understands.
  if (!isa<PreprocessJobAction>(JA) && !isa<CompileJobAction>(JA) &&
      !isa<BackendJobAction>(JA))
    return false;

  return true;
}

bool Driver::ShouldEmitStaticLibrary(const ArgList &Args) const {
  // Only emit static library if the flag is set explicitly.
  if (Args.hasArg(options::OPT_emit_static_lib))
    return true;
  return false;
}

/// GetReleaseVersion - Parse (([0-9]+)(.([0-9]+)(.([0-9]+)?))?)? and return the
/// grouped values as integers. Numbers which are not provided are set to 0.
///
/// \return True if the entire string was parsed (9.2), or all groups were
/// parsed (10.3.5extrastuff).
bool Driver::GetReleaseVersion(StringRef Str, unsigned &Major, unsigned &Minor,
                               unsigned &Micro, bool &HadExtra) {
  HadExtra = false;

  Major = Minor = Micro = 0;
  if (Str.empty())
    return false;

  if (Str.consumeInteger(10, Major))
    return false;
  if (Str.empty())
    return true;
  if (Str[0] != '.')
    return false;

  Str = Str.drop_front(1);

  if (Str.consumeInteger(10, Minor))
    return false;
  if (Str.empty())
    return true;
  if (Str[0] != '.')
    return false;
  Str = Str.drop_front(1);

  if (Str.consumeInteger(10, Micro))
    return false;
  if (!Str.empty())
    HadExtra = true;
  return true;
}

/// Parse digits from a string \p Str and fill \p Digits with
/// the parsed numbers. This method assumes that the max number of
/// digits to look for is equal to Digits.size().
///
/// \return True if the entire string was parsed and there are
/// no extra characters remaining at the end.
bool Driver::GetReleaseVersion(StringRef Str,
                               MutableArrayRef<unsigned> Digits) {
  if (Str.empty())
    return false;

  unsigned CurDigit = 0;
  while (CurDigit < Digits.size()) {
    unsigned Digit;
    if (Str.consumeInteger(10, Digit))
      return false;
    Digits[CurDigit] = Digit;
    if (Str.empty())
      return true;
    if (Str[0] != '.')
      return false;
    Str = Str.drop_front(1);
    CurDigit++;
  }

  // More digits than requested, bail out...
  return false;
}

std::pair<unsigned, unsigned>
Driver::getIncludeExcludeOptionFlagMasks(bool IsClCompatMode) const {
  unsigned IncludedFlagsBitmask = 0;
  unsigned ExcludedFlagsBitmask = options::NoDriverOption;

  if (IsClCompatMode) {
    // Include CL and Core options.
    IncludedFlagsBitmask |= options::CLOption;
    IncludedFlagsBitmask |= options::CLDXCOption;
    IncludedFlagsBitmask |= options::CoreOption;
  } else {
    ExcludedFlagsBitmask |= options::CLOption;
  }
  if (IsDXCMode()) {
    // Include DXC and Core options.
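// --- illustrative sketch (not part of the patch) ---------------------------
// A standalone rendition of the version parsing done by
// Driver::GetReleaseVersion above: accept "major[.minor[.micro]]", default
// missing components to 0, reject junk after the major or minor component,
// and only tolerate (but report) trailing text after the micro component,
// so "9.2" parses fully and "10.3.5extrastuff" parses with HadExtra = true.
// Names are invented for illustration; only the standard library is used.
#include <charconv>
#include <string_view>

static bool parseReleaseVersion(std::string_view Str, unsigned &Major,
                                unsigned &Minor, unsigned &Micro,
                                bool &HadExtra) {
  HadExtra = false;
  Major = Minor = Micro = 0;
  unsigned *Parts[] = {&Major, &Minor, &Micro};
  for (unsigned I = 0; I < 3; ++I) {
    auto [Ptr, Ec] =
        std::from_chars(Str.data(), Str.data() + Str.size(), *Parts[I]);
    if (Ec != std::errc())
      return false;             // Component is missing or not a number.
    Str.remove_prefix(Ptr - Str.data());
    if (Str.empty())
      return true;              // Fully consumed, e.g. "9.2".
    if (I == 2)
      break;                    // The third component may carry a suffix.
    if (Str.front() != '.')
      return false;             // Junk after major or minor: reject.
    Str.remove_prefix(1);       // Skip '.' before the next component.
  }
  HadExtra = true;              // e.g. "extrastuff" in "10.3.5extrastuff".
  return true;
}
// ----------------------------------------------------------------------------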
IncludedFlagsBitmask |= options::DXCOption; IncludedFlagsBitmask |= options::CLDXCOption; IncludedFlagsBitmask |= options::CoreOption; } else { ExcludedFlagsBitmask |= options::DXCOption; } if (!IsClCompatMode && !IsDXCMode()) ExcludedFlagsBitmask |= options::CLDXCOption; return std::make_pair(IncludedFlagsBitmask, ExcludedFlagsBitmask); } const char *Driver::getExecutableForDriverMode(DriverMode Mode) { switch (Mode) { case GCCMode: return "clang"; case GXXMode: return "clang++"; case CPPMode: return "clang-cpp"; case CLMode: return "clang-cl"; case FlangMode: return "flang"; case DXCMode: return "clang-dxc"; } llvm_unreachable("Unhandled Mode"); } bool clang::driver::isOptimizationLevelFast(const ArgList &Args) { return Args.hasFlag(options::OPT_Ofast, options::OPT_O_Group, false); } bool clang::driver::willEmitRemarks(const ArgList &Args) { // -fsave-optimization-record enables it. if (Args.hasFlag(options::OPT_fsave_optimization_record, options::OPT_fno_save_optimization_record, false)) return true; // -fsave-optimization-record= enables it as well. if (Args.hasFlag(options::OPT_fsave_optimization_record_EQ, options::OPT_fno_save_optimization_record, false)) return true; // -foptimization-record-file alone enables it too. if (Args.hasFlag(options::OPT_foptimization_record_file_EQ, options::OPT_fno_save_optimization_record, false)) return true; // -foptimization-record-passes alone enables it too. if (Args.hasFlag(options::OPT_foptimization_record_passes_EQ, options::OPT_fno_save_optimization_record, false)) return true; return false; } llvm::StringRef clang::driver::getDriverMode(StringRef ProgName, ArrayRef Args) { static const std::string OptName = getDriverOptTable().getOption(options::OPT_driver_mode).getPrefixedName(); llvm::StringRef Opt; for (StringRef Arg : Args) { if (!Arg.startswith(OptName)) continue; Opt = Arg; } if (Opt.empty()) Opt = ToolChain::getTargetAndModeFromProgramName(ProgName).DriverMode; return Opt.consume_front(OptName) ? Opt : ""; } bool driver::IsClangCL(StringRef DriverMode) { return DriverMode.equals("cl"); } llvm::Error driver::expandResponseFiles(SmallVectorImpl &Args, bool ClangCLMode, llvm::BumpPtrAllocator &Alloc, llvm::vfs::FileSystem *FS) { // Parse response files using the GNU syntax, unless we're in CL mode. There // are two ways to put clang in CL compatibility mode: ProgName is either // clang-cl or cl, or --driver-mode=cl is on the command line. The normal // command line parsing can't happen until after response file parsing, so we // have to manually search for a --driver-mode=cl argument the hard way. // Finally, our -cc1 tools don't care which tokenization mode we use because // response files written by clang will tokenize the same way in either mode. enum { Default, POSIX, Windows } RSPQuoting = Default; for (const char *F : Args) { if (strcmp(F, "--rsp-quoting=posix") == 0) RSPQuoting = POSIX; else if (strcmp(F, "--rsp-quoting=windows") == 0) RSPQuoting = Windows; } // Determines whether we want nullptr markers in Args to indicate response // files end-of-lines. We only use this for the /LINK driver argument with // clang-cl.exe on Windows. 
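// --- illustrative sketch (not part of the patch) ---------------------------
// The tokenizer choice made in expandResponseFiles above, reduced to a
// standalone helper: an explicit --rsp-quoting= flag wins (last one on the
// command line takes effect), otherwise clang-cl mode implies Windows
// quoting rules and everything else uses the POSIX/GNU rules. Names are
// invented for illustration.
#include <cstring>
#include <vector>

enum class RspQuoting { POSIX, Windows };

static RspQuoting pickRspQuoting(const std::vector<const char *> &Args,
                                 bool ClangCLMode) {
  RspQuoting Quoting = ClangCLMode ? RspQuoting::Windows : RspQuoting::POSIX;
  for (const char *Arg : Args) {
    if (std::strcmp(Arg, "--rsp-quoting=posix") == 0)
      Quoting = RspQuoting::POSIX;    // Later flags override earlier ones.
    else if (std::strcmp(Arg, "--rsp-quoting=windows") == 0)
      Quoting = RspQuoting::Windows;
  }
  return Quoting;
}
// ----------------------------------------------------------------------------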
bool MarkEOLs = ClangCLMode; llvm::cl::TokenizerCallback Tokenizer; if (RSPQuoting == Windows || (RSPQuoting == Default && ClangCLMode)) Tokenizer = &llvm::cl::TokenizeWindowsCommandLine; else Tokenizer = &llvm::cl::TokenizeGNUCommandLine; if (MarkEOLs && Args.size() > 1 && StringRef(Args[1]).startswith("-cc1")) MarkEOLs = false; llvm::cl::ExpansionContext ECtx(Alloc, Tokenizer); ECtx.setMarkEOLs(MarkEOLs); if (FS) ECtx.setVFS(FS); if (llvm::Error Err = ECtx.expandResponseFiles(Args)) return Err; // If -cc1 came from a response file, remove the EOL sentinels. auto FirstArg = llvm::find_if(llvm::drop_begin(Args), [](const char *A) { return A != nullptr; }); if (FirstArg != Args.end() && StringRef(*FirstArg).startswith("-cc1")) { // If -cc1 came from a response file, remove the EOL sentinels. if (MarkEOLs) { auto newEnd = std::remove(Args.begin(), Args.end(), nullptr); Args.resize(newEnd - Args.begin()); } } return llvm::Error::success(); } diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp index 97217eba9ca0..bfc86d9f3471 100644 --- a/clang/lib/Driver/ToolChains/AIX.cpp +++ b/clang/lib/Driver/ToolChains/AIX.cpp @@ -1,450 +1,456 @@ //===--- AIX.cpp - AIX ToolChain Implementations ----------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "AIX.h" #include "Arch/PPC.h" #include "CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Option/ArgList.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Path.h" using AIX = clang::driver::toolchains::AIX; using namespace clang::driver; using namespace clang::driver::tools; using namespace clang::driver::toolchains; using namespace llvm::opt; using namespace llvm::sys; void aix::Assembler::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { + const Driver &D = getToolChain().getDriver(); ArgStringList CmdArgs; const bool IsArch32Bit = getToolChain().getTriple().isArch32Bit(); const bool IsArch64Bit = getToolChain().getTriple().isArch64Bit(); // Only support 32 and 64 bit. if (!IsArch32Bit && !IsArch64Bit) llvm_unreachable("Unsupported bit width value."); + if (Arg *A = C.getArgs().getLastArg(options::OPT_G)) { + D.Diag(diag::err_drv_unsupported_opt_for_target) + << A->getSpelling() << D.getTargetTriple(); + } + // Specify the mode in which the as(1) command operates. if (IsArch32Bit) { CmdArgs.push_back("-a32"); } else { // Must be 64-bit, otherwise asserted already. CmdArgs.push_back("-a64"); } // Accept any mixture of instructions. // On Power for AIX and Linux, this behaviour matches that of GCC for both the // user-provided assembler source case and the compiler-produced assembler // source case. Yet XL with user-provided assembler source would not add this. CmdArgs.push_back("-many"); Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); // Specify assembler output file. assert((Output.isFilename() || Output.isNothing()) && "Invalid output."); if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); } // Specify assembler input file. 
// The system assembler on AIX takes exactly one input file. The driver is // expected to invoke as(1) separately for each assembler source input file. if (Inputs.size() != 1) llvm_unreachable("Invalid number of input files."); const InputInfo &II = Inputs[0]; assert((II.isFilename() || II.isNothing()) && "Invalid input."); if (II.isFilename()) CmdArgs.push_back(II.getFilename()); const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), Exec, CmdArgs, Inputs, Output)); } // Determine whether there are any linker options that supply an export list // (or equivalent information about what to export) being sent to the linker. static bool hasExportListLinkerOpts(const ArgStringList &CmdArgs) { for (size_t i = 0, Size = CmdArgs.size(); i < Size; ++i) { llvm::StringRef ArgString(CmdArgs[i]); if (ArgString.startswith("-bE:") || ArgString.startswith("-bexport:") || ArgString == "-bexpall" || ArgString == "-bexpfull") return true; // If we split -b option, check the next opt. if (ArgString == "-b" && i + 1 < Size) { ++i; llvm::StringRef ArgNextString(CmdArgs[i]); if (ArgNextString.startswith("E:") || ArgNextString.startswith("export:") || ArgNextString == "expall" || ArgNextString == "expfull") return true; } } return false; } void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { const AIX &ToolChain = static_cast(getToolChain()); const Driver &D = ToolChain.getDriver(); ArgStringList CmdArgs; const bool IsArch32Bit = ToolChain.getTriple().isArch32Bit(); const bool IsArch64Bit = ToolChain.getTriple().isArch64Bit(); // Only support 32 and 64 bit. if (!(IsArch32Bit || IsArch64Bit)) llvm_unreachable("Unsupported bit width value."); if (Arg *A = C.getArgs().getLastArg(options::OPT_G)) { D.Diag(diag::err_drv_unsupported_opt_for_target) << A->getSpelling() << D.getTargetTriple(); } // Force static linking when "-static" is present. if (Args.hasArg(options::OPT_static)) CmdArgs.push_back("-bnso"); // Add options for shared libraries. if (Args.hasArg(options::OPT_shared)) { CmdArgs.push_back("-bM:SRE"); CmdArgs.push_back("-bnoentry"); } if (Args.hasFlag(options::OPT_mxcoff_roptr, options::OPT_mno_xcoff_roptr, false)) { if (Args.hasArg(options::OPT_shared)) D.Diag(diag::err_roptr_cannot_build_shared); // The `-mxcoff-roptr` option places constants in RO sections as much as // possible. Then `-bforceimprw` changes such sections to RW if they contain // imported symbols that need to be resolved. CmdArgs.push_back("-bforceimprw"); } // PGO instrumentation generates symbols belonging to special sections, and // the linker needs to place all symbols in a particular section together in // memory; the AIX linker does that under an option. 
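// --- illustrative sketch (not part of the patch) ---------------------------
// The export-list detection in hasExportListLinkerOpts above, restated as a
// standalone scan: an export list is supplied either as a single token
// ("-bE:...", "-bexport:...", "-bexpall", "-bexpfull") or as "-b" followed
// by the same spelling without the leading "-b". Names are invented for
// illustration.
#include <string_view>
#include <vector>

static bool suppliesExportList(const std::vector<std::string_view> &Args) {
  auto IsExportOpt = [](std::string_view S) {
    return S.rfind("E:", 0) == 0 || S.rfind("export:", 0) == 0 ||
           S == "expall" || S == "expfull";
  };
  for (size_t I = 0, Size = Args.size(); I < Size; ++I) {
    std::string_view A = Args[I];
    if (A.rfind("-b", 0) == 0 && A.size() > 2 && IsExportOpt(A.substr(2)))
      return true;                       // Joined form, e.g. "-bE:file.exp".
    if (A == "-b" && I + 1 < Size && IsExportOpt(Args[I + 1]))
      return true;                       // Split form, e.g. "-b" "expall".
  }
  return false;
}
// ----------------------------------------------------------------------------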
if (Args.hasFlag(options::OPT_fprofile_arcs, options::OPT_fno_profile_arcs, false) || Args.hasFlag(options::OPT_fprofile_generate, options::OPT_fno_profile_generate, false) || Args.hasFlag(options::OPT_fprofile_generate_EQ, options::OPT_fno_profile_generate, false) || Args.hasFlag(options::OPT_fprofile_instr_generate, options::OPT_fno_profile_instr_generate, false) || Args.hasFlag(options::OPT_fprofile_instr_generate_EQ, options::OPT_fno_profile_instr_generate, false) || Args.hasFlag(options::OPT_fcs_profile_generate, options::OPT_fno_profile_generate, false) || Args.hasFlag(options::OPT_fcs_profile_generate_EQ, options::OPT_fno_profile_generate, false) || Args.hasArg(options::OPT_fcreate_profile) || Args.hasArg(options::OPT_coverage)) CmdArgs.push_back("-bdbg:namedsects:ss"); if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mxcoff_build_id_EQ)) { StringRef BuildId = A->getValue(); if (BuildId[0] != '0' || BuildId[1] != 'x' || BuildId.find_if_not(llvm::isHexDigit, 2) != StringRef::npos) ToolChain.getDriver().Diag(diag::err_drv_unsupported_option_argument) << A->getSpelling() << BuildId; else { std::string LinkerFlag = "-bdbg:ldrinfo:xcoff_binary_id:0x"; if (BuildId.size() % 2) // Prepend a 0 if odd number of digits. LinkerFlag += "0"; LinkerFlag += BuildId.drop_front(2).lower(); CmdArgs.push_back(Args.MakeArgString(LinkerFlag)); } } // Specify linker output file. assert((Output.isFilename() || Output.isNothing()) && "Invalid output."); if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); } // Set linking mode (i.e., 32/64-bit) and the address of // text and data sections based on arch bit width. if (IsArch32Bit) { CmdArgs.push_back("-b32"); CmdArgs.push_back("-bpT:0x10000000"); CmdArgs.push_back("-bpD:0x20000000"); } else { // Must be 64-bit, otherwise asserted already. CmdArgs.push_back("-b64"); CmdArgs.push_back("-bpT:0x100000000"); CmdArgs.push_back("-bpD:0x110000000"); } if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles, options::OPT_shared, options::OPT_r)) { auto getCrt0Basename = [&Args, IsArch32Bit] { if (Arg *A = Args.getLastArgNoClaim(options::OPT_p, options::OPT_pg)) { // Enable gprofiling when "-pg" is specified. if (A->getOption().matches(options::OPT_pg)) return IsArch32Bit ? "gcrt0.o" : "gcrt0_64.o"; // Enable profiling when "-p" is specified. return IsArch32Bit ? "mcrt0.o" : "mcrt0_64.o"; } return IsArch32Bit ? "crt0.o" : "crt0_64.o"; }; CmdArgs.push_back( Args.MakeArgString(ToolChain.GetFilePath(getCrt0Basename()))); CmdArgs.push_back(Args.MakeArgString( ToolChain.GetFilePath(IsArch32Bit ? "crti.o" : "crti_64.o"))); } // Collect all static constructor and destructor functions in both C and CXX // language link invocations. This has to come before AddLinkerInputs as the // implied option needs to precede any other '-bcdtors' settings or // '-bnocdtors' that '-Wl' might forward. CmdArgs.push_back("-bcdtors:all:0:s"); // Specify linker input file(s). 
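// --- illustrative sketch (not part of the patch) ---------------------------
// The startup-file choice made by getCrt0Basename above, written out as a
// standalone table: -pg selects the gprof-instrumented gcrt0, -p selects
// the prof variant mcrt0, plain links get crt0, and each has a "_64"
// spelling for 64-bit links. In the driver the decision keys off whichever
// of -p/-pg appears last; here that is reduced to two flags with gprof
// taking precedence. Names are invented for illustration.
#include <string>

static std::string pickAIXCrt0(bool IsArch32Bit, bool ProfileWithGprof,
                               bool ProfileWithProf) {
  const char *Base = "crt0";
  if (ProfileWithGprof)
    Base = "gcrt0";        // -pg: gprof-style profiling startup file.
  else if (ProfileWithProf)
    Base = "mcrt0";        // -p: prof-style profiling startup file.
  return std::string(Base) + (IsArch32Bit ? ".o" : "_64.o");
}
// ----------------------------------------------------------------------------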
AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs, JA); if (D.isUsingLTO()) { assert(!Inputs.empty() && "Must have at least one input."); addLTOOptions(ToolChain, Args, CmdArgs, Output, Inputs[0], D.getLTOMode() == LTOK_Thin); } if (Args.hasArg(options::OPT_shared) && !hasExportListLinkerOpts(CmdArgs)) { const char *CreateExportListExec = Args.MakeArgString( path::parent_path(ToolChain.getDriver().ClangExecutable) + "/llvm-nm"); ArgStringList CreateExportCmdArgs; std::string CreateExportListPath = C.getDriver().GetTemporaryPath("CreateExportList", "exp"); const char *ExportList = C.addTempFile(C.getArgs().MakeArgString(CreateExportListPath)); for (const auto &II : Inputs) if (II.isFilename()) CreateExportCmdArgs.push_back(II.getFilename()); CreateExportCmdArgs.push_back("--export-symbols"); CreateExportCmdArgs.push_back("-X"); if (IsArch32Bit) { CreateExportCmdArgs.push_back("32"); } else { // Must be 64-bit, otherwise asserted already. CreateExportCmdArgs.push_back("64"); } auto ExpCommand = std::make_unique( JA, *this, ResponseFileSupport::None(), CreateExportListExec, CreateExportCmdArgs, Inputs, Output); ExpCommand->setRedirectFiles( {std::nullopt, std::string(ExportList), std::nullopt}); C.addCommand(std::move(ExpCommand)); CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-bE:") + ExportList)); } // Add directory to library search path. Args.AddAllArgs(CmdArgs, options::OPT_L); if (!Args.hasArg(options::OPT_r)) { ToolChain.AddFilePathLibArgs(Args, CmdArgs); ToolChain.addProfileRTLibs(Args, CmdArgs); if (getToolChain().ShouldLinkCXXStdlib(Args)) getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs); if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { AddRunTimeLibs(ToolChain, D, CmdArgs, Args); // Add OpenMP runtime if -fopenmp is specified. if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, options::OPT_fno_openmp, false)) { switch (ToolChain.getDriver().getOpenMPRuntime(Args)) { case Driver::OMPRT_OMP: CmdArgs.push_back("-lomp"); break; case Driver::OMPRT_IOMP5: CmdArgs.push_back("-liomp5"); break; case Driver::OMPRT_GOMP: CmdArgs.push_back("-lgomp"); break; case Driver::OMPRT_Unknown: // Already diagnosed. break; } } // Support POSIX threads if "-pthreads" or "-pthread" is present. if (Args.hasArg(options::OPT_pthreads, options::OPT_pthread)) CmdArgs.push_back("-lpthreads"); if (D.CCCIsCXX()) CmdArgs.push_back("-lm"); CmdArgs.push_back("-lc"); if (Args.hasArgNoClaim(options::OPT_p, options::OPT_pg)) { CmdArgs.push_back(Args.MakeArgString((llvm::Twine("-L") + D.SysRoot) + "/lib/profiled")); CmdArgs.push_back(Args.MakeArgString((llvm::Twine("-L") + D.SysRoot) + "/usr/lib/profiled")); } } } const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), Exec, CmdArgs, Inputs, Output)); } /// AIX - AIX tool chain which can call as(1) and ld(1) directly. AIX::AIX(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) : ToolChain(D, Triple, Args) { getProgramPaths().push_back(getDriver().getInstalledDir()); if (getDriver().getInstalledDir() != getDriver().Dir) getProgramPaths().push_back(getDriver().Dir); ParseInlineAsmUsingAsmParser = Args.hasFlag( options::OPT_fintegrated_as, options::OPT_fno_integrated_as, true); getLibraryPaths().push_back(getDriver().SysRoot + "/usr/lib"); } // Returns the effective header sysroot path to use. // This comes from either -isysroot or --sysroot. 
llvm::StringRef AIX::GetHeaderSysroot(const llvm::opt::ArgList &DriverArgs) const { if (DriverArgs.hasArg(options::OPT_isysroot)) return DriverArgs.getLastArgValue(options::OPT_isysroot); if (!getDriver().SysRoot.empty()) return getDriver().SysRoot; return "/"; } void AIX::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { // Return if -nostdinc is specified as a driver option. if (DriverArgs.hasArg(options::OPT_nostdinc)) return; llvm::StringRef Sysroot = GetHeaderSysroot(DriverArgs); const Driver &D = getDriver(); if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { SmallString<128> P(D.ResourceDir); // Add the PowerPC intrinsic headers (/include/ppc_wrappers) path::append(P, "include", "ppc_wrappers"); addSystemInclude(DriverArgs, CC1Args, P); // Add the Clang builtin headers (/include) addSystemInclude(DriverArgs, CC1Args, path::parent_path(P.str())); } // Return if -nostdlibinc is specified as a driver option. if (DriverArgs.hasArg(options::OPT_nostdlibinc)) return; // Add /usr/include. SmallString<128> UP(Sysroot); path::append(UP, "/usr/include"); addSystemInclude(DriverArgs, CC1Args, UP.str()); } void AIX::AddClangCXXStdlibIncludeArgs( const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const { if (DriverArgs.hasArg(options::OPT_nostdinc) || DriverArgs.hasArg(options::OPT_nostdincxx) || DriverArgs.hasArg(options::OPT_nostdlibinc)) return; switch (GetCXXStdlibType(DriverArgs)) { case ToolChain::CST_Libstdcxx: llvm::report_fatal_error( "picking up libstdc++ headers is unimplemented on AIX"); case ToolChain::CST_Libcxx: { llvm::StringRef Sysroot = GetHeaderSysroot(DriverArgs); SmallString<128> PathCPP(Sysroot); llvm::sys::path::append(PathCPP, "opt/IBM/openxlCSDK", "include", "c++", "v1"); addSystemInclude(DriverArgs, CC1Args, PathCPP.str()); // Required in order to suppress conflicting C++ overloads in the system // libc headers that were used by XL C++. CC1Args.push_back("-D__LIBC_NO_CPP_MATH_OVERLOADS__"); return; } } llvm_unreachable("Unexpected C++ library type; only libc++ is supported."); } void AIX::AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const { switch (GetCXXStdlibType(Args)) { case ToolChain::CST_Libstdcxx: llvm::report_fatal_error("linking libstdc++ unimplemented on AIX"); case ToolChain::CST_Libcxx: CmdArgs.push_back("-lc++"); if (Args.hasArg(options::OPT_fexperimental_library)) CmdArgs.push_back("-lc++experimental"); CmdArgs.push_back("-lc++abi"); return; } llvm_unreachable("Unexpected C++ library type; only libc++ is supported."); } void AIX::addClangTargetOptions( const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CC1Args, Action::OffloadKind DeviceOffloadingKind) const { Args.AddLastArg(CC1Args, options::OPT_mignore_xcoff_visibility); Args.AddLastArg(CC1Args, options::OPT_mdefault_visibility_export_mapping_EQ); Args.addOptInFlag(CC1Args, options::OPT_mxcoff_roptr, options::OPT_mno_xcoff_roptr); if (Args.hasFlag(options::OPT_fxl_pragma_pack, options::OPT_fno_xl_pragma_pack, true)) CC1Args.push_back("-fxl-pragma-pack"); } void AIX::addProfileRTLibs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const { // Add linker option -u__llvm_profile_runtime to cause runtime // initialization to occur. 
if (needsProfileRT(Args)) CmdArgs.push_back(Args.MakeArgString( Twine("-u", llvm::getInstrProfRuntimeHookVarName()))); ToolChain::addProfileRTLibs(Args, CmdArgs); } ToolChain::CXXStdlibType AIX::GetDefaultCXXStdlibType() const { return ToolChain::CST_Libcxx; } ToolChain::RuntimeLibType AIX::GetDefaultRuntimeLibType() const { return ToolChain::RLT_CompilerRT; } auto AIX::buildAssembler() const -> Tool * { return new aix::Assembler(*this); } auto AIX::buildLinker() const -> Tool * { return new aix::Linker(*this); } diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp index 4383b8004143..cf2bc63d74ad 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.cpp +++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp @@ -1,282 +1,276 @@ //===--- X86.cpp - X86 Helpers for Tools ------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "X86.h" #include "ToolChains/CommonArgs.h" #include "clang/Driver/Driver.h" #include "clang/Driver/DriverDiagnostic.h" #include "clang/Driver/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/Host.h" using namespace clang::driver; using namespace clang::driver::tools; using namespace clang; using namespace llvm::opt; std::string x86::getX86TargetCPU(const Driver &D, const ArgList &Args, const llvm::Triple &Triple) { if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) { StringRef CPU = A->getValue(); if (CPU != "native") return std::string(CPU); // FIXME: Reject attempts to use -march=native unless the target matches // the host. CPU = llvm::sys::getHostCPUName(); if (!CPU.empty() && CPU != "generic") return std::string(CPU); } if (const Arg *A = Args.getLastArg(options::OPT__SLASH_arch)) { // Mapping built by looking at lib/Basic's X86TargetInfo::initFeatureMap(). // The keys are case-sensitive; this matches link.exe. // 32-bit and 64-bit /arch: flags. llvm::StringMap ArchMap({ {"AVX", "sandybridge"}, {"AVX2", "haswell"}, {"AVX512F", "knl"}, {"AVX512", "skylake-avx512"}, }); if (Triple.getArch() == llvm::Triple::x86) { // 32-bit-only /arch: flags. ArchMap.insert({ {"IA32", "i386"}, {"SSE", "pentium3"}, {"SSE2", "pentium4"}, }); } StringRef CPU = ArchMap.lookup(A->getValue()); if (CPU.empty()) { std::vector ValidArchs{ArchMap.keys().begin(), ArchMap.keys().end()}; sort(ValidArchs); D.Diag(diag::warn_drv_invalid_arch_name_with_suggestion) << A->getValue() << (Triple.getArch() == llvm::Triple::x86) << join(ValidArchs, ", "); } return std::string(CPU); } // Select the default CPU if none was given (or detection failed). if (!Triple.isX86()) return ""; // This routine is only handling x86 targets. bool Is64Bit = Triple.getArch() == llvm::Triple::x86_64; // FIXME: Need target hooks. if (Triple.isOSDarwin()) { if (Triple.getArchName() == "x86_64h") return "core-avx2"; // macosx10.12 drops support for all pre-Penryn Macs. // Simulators can still run on 10.11 though, like Xcode. if (Triple.isMacOSX() && !Triple.isOSVersionLT(10, 12)) return "penryn"; if (Triple.isDriverKit()) return "nehalem"; // The oldest x86_64 Macs have core2/Merom; the oldest x86 Macs have Yonah. return Is64Bit ? "core2" : "yonah"; } // Set up default CPU name for PS4/PS5 compilers. 
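// --- illustrative sketch (not part of the patch) ---------------------------
// The /arch: handling in x86::getX86TargetCPU above, reduced to a plain map
// lookup: case-sensitive keys, a few extra 32-bit-only entries, and an
// empty result when the value is unknown (the driver then warns and lists
// the valid spellings). The key/value pairs are copied from the code above;
// the function and parameter names are invented for illustration.
#include <map>
#include <string>

static std::string mapSlashArchToCPU(const std::string &Arch, bool Is32Bit) {
  std::map<std::string, std::string> ArchMap = {
      {"AVX", "sandybridge"},
      {"AVX2", "haswell"},
      {"AVX512F", "knl"},
      {"AVX512", "skylake-avx512"},
  };
  if (Is32Bit) {
    // 32-bit-only /arch: spellings.
    ArchMap.insert(
        {{"IA32", "i386"}, {"SSE", "pentium3"}, {"SSE2", "pentium4"}});
  }
  auto It = ArchMap.find(Arch);
  return It == ArchMap.end() ? std::string() : It->second;
}
// ----------------------------------------------------------------------------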
if (Triple.isPS4()) return "btver2"; if (Triple.isPS5()) return "znver2"; // On Android use targets compatible with gcc if (Triple.isAndroid()) return Is64Bit ? "x86-64" : "i686"; // Everything else goes to x86-64 in 64-bit mode. if (Is64Bit) return "x86-64"; switch (Triple.getOS()) { case llvm::Triple::NetBSD: return "i486"; case llvm::Triple::Haiku: case llvm::Triple::OpenBSD: return "i586"; case llvm::Triple::FreeBSD: return "i686"; default: // Fallback to p4. return "pentium4"; } } void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, const ArgList &Args, - std::vector &Features, bool ForAS) { - if (ForAS) { - // Some target-specific options are only handled in AddX86TargetArgs, which - // is not called by ClangAs::ConstructJob. Claim them here. - Args.claimAllArgs(options::OPT_mfpmath_EQ); - } - + std::vector &Features) { // Claim and report unsupported -mabi=. Note: we don't support "sysv_abi" or // "ms_abi" as default function attributes. if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mabi_EQ)) { StringRef DefaultAbi = Triple.isOSWindows() ? "ms" : "sysv"; if (A->getValue() != DefaultAbi) D.Diag(diag::err_drv_unsupported_opt_for_target) << A->getSpelling() << Triple.getTriple(); } // If -march=native, autodetect the feature list. if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) { if (StringRef(A->getValue()) == "native") { llvm::StringMap HostFeatures; if (llvm::sys::getHostCPUFeatures(HostFeatures)) for (auto &F : HostFeatures) Features.push_back( Args.MakeArgString((F.second ? "+" : "-") + F.first())); } } if (Triple.getArchName() == "x86_64h") { // x86_64h implies quite a few of the more modern subtarget features // for Haswell class CPUs, but not all of them. Opt-out of a few. Features.push_back("-rdrnd"); Features.push_back("-aes"); Features.push_back("-pclmul"); Features.push_back("-rtm"); Features.push_back("-fsgsbase"); } const llvm::Triple::ArchType ArchType = Triple.getArch(); // Add features to be compatible with gcc for Android. if (Triple.isAndroid()) { if (ArchType == llvm::Triple::x86_64) { Features.push_back("+sse4.2"); Features.push_back("+popcnt"); Features.push_back("+cx16"); } else Features.push_back("+ssse3"); } // Translate the high level `-mretpoline` flag to the specific target feature // flags. We also detect if the user asked for retpoline external thunks but // failed to ask for retpolines themselves (through any of the different // flags). This is a bit hacky but keeps existing usages working. We should // consider deprecating this and instead warn if the user requests external // retpoline thunks and *doesn't* request some form of retpolines. auto SpectreOpt = clang::driver::options::ID::OPT_INVALID; if (Args.hasArgNoClaim(options::OPT_mretpoline, options::OPT_mno_retpoline, options::OPT_mspeculative_load_hardening, options::OPT_mno_speculative_load_hardening)) { if (Args.hasFlag(options::OPT_mretpoline, options::OPT_mno_retpoline, false)) { Features.push_back("+retpoline-indirect-calls"); Features.push_back("+retpoline-indirect-branches"); SpectreOpt = options::OPT_mretpoline; } else if (Args.hasFlag(options::OPT_mspeculative_load_hardening, options::OPT_mno_speculative_load_hardening, false)) { // On x86, speculative load hardening relies on at least using retpolines // for indirect calls. 
Features.push_back("+retpoline-indirect-calls"); SpectreOpt = options::OPT_mspeculative_load_hardening; } } else if (Args.hasFlag(options::OPT_mretpoline_external_thunk, options::OPT_mno_retpoline_external_thunk, false)) { // FIXME: Add a warning about failing to specify `-mretpoline` and // eventually switch to an error here. Features.push_back("+retpoline-indirect-calls"); Features.push_back("+retpoline-indirect-branches"); SpectreOpt = options::OPT_mretpoline_external_thunk; } auto LVIOpt = clang::driver::options::ID::OPT_INVALID; if (Args.hasFlag(options::OPT_mlvi_hardening, options::OPT_mno_lvi_hardening, false)) { Features.push_back("+lvi-load-hardening"); Features.push_back("+lvi-cfi"); // load hardening implies CFI protection LVIOpt = options::OPT_mlvi_hardening; } else if (Args.hasFlag(options::OPT_mlvi_cfi, options::OPT_mno_lvi_cfi, false)) { Features.push_back("+lvi-cfi"); LVIOpt = options::OPT_mlvi_cfi; } if (Args.hasFlag(options::OPT_m_seses, options::OPT_mno_seses, false)) { if (LVIOpt == options::OPT_mlvi_hardening) D.Diag(diag::err_drv_argument_not_allowed_with) << D.getOpts().getOptionName(options::OPT_mlvi_hardening) << D.getOpts().getOptionName(options::OPT_m_seses); if (SpectreOpt != clang::driver::options::ID::OPT_INVALID) D.Diag(diag::err_drv_argument_not_allowed_with) << D.getOpts().getOptionName(SpectreOpt) << D.getOpts().getOptionName(options::OPT_m_seses); Features.push_back("+seses"); if (!Args.hasArg(options::OPT_mno_lvi_cfi)) { Features.push_back("+lvi-cfi"); LVIOpt = options::OPT_mlvi_cfi; } } if (SpectreOpt != clang::driver::options::ID::OPT_INVALID && LVIOpt != clang::driver::options::ID::OPT_INVALID) { D.Diag(diag::err_drv_argument_not_allowed_with) << D.getOpts().getOptionName(SpectreOpt) << D.getOpts().getOptionName(LVIOpt); } // Now add any that the user explicitly requested on the command line, // which may override the defaults. for (const Arg *A : Args.filtered(options::OPT_m_x86_Features_Group, options::OPT_mgeneral_regs_only)) { StringRef Name = A->getOption().getName(); A->claim(); // Skip over "-m". assert(Name.startswith("m") && "Invalid feature name."); Name = Name.substr(1); // Replace -mgeneral-regs-only with -x87, -mmx, -sse if (A->getOption().getID() == options::OPT_mgeneral_regs_only) { Features.insert(Features.end(), {"-x87", "-mmx", "-sse"}); continue; } bool IsNegative = Name.startswith("no-"); if (IsNegative) Name = Name.substr(3); Features.push_back(Args.MakeArgString((IsNegative ? "-" : "+") + Name)); } // Enable/disable straight line speculation hardening. 
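// --- illustrative sketch (not part of the patch) ---------------------------
// The option-to-feature translation in the loop above, restated standalone:
// strip the "m" prefix from the option name, turn a "no-" spelling into a
// "-feature" request and everything else into "+feature", and expand the
// special case -mgeneral-regs-only into disabling x87, MMX and SSE. Names
// are invented for illustration.
#include <string>
#include <vector>

static void appendX86Feature(std::vector<std::string> &Features,
                             const std::string &OptName) {
  // OptName is the option name without the leading '-', e.g. "mavx2",
  // "mno-avx2" or "mgeneral-regs-only".
  if (OptName == "mgeneral-regs-only") {
    Features.insert(Features.end(), {"-x87", "-mmx", "-sse"});
    return;
  }
  std::string Name = OptName.substr(1);      // Drop the "m".
  bool IsNegative = Name.rfind("no-", 0) == 0;
  if (IsNegative)
    Name = Name.substr(3);                   // Drop the "no-".
  Features.push_back((IsNegative ? "-" : "+") + Name);
}
// ----------------------------------------------------------------------------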
if (Arg *A = Args.getLastArg(options::OPT_mharden_sls_EQ)) { StringRef Scope = A->getValue(); if (Scope == "all") { Features.push_back("+harden-sls-ijmp"); Features.push_back("+harden-sls-ret"); } else if (Scope == "return") { Features.push_back("+harden-sls-ret"); } else if (Scope == "indirect-jmp") { Features.push_back("+harden-sls-ijmp"); } else if (Scope != "none") { D.Diag(diag::err_drv_unsupported_option_argument) << A->getSpelling() << Scope; } } // -mno-gather, -mno-scatter support if (Args.hasArg(options::OPT_mno_gather)) Features.push_back("+prefer-no-gather"); if (Args.hasArg(options::OPT_mno_scatter)) Features.push_back("+prefer-no-scatter"); } diff --git a/clang/lib/Driver/ToolChains/Arch/X86.h b/clang/lib/Driver/ToolChains/Arch/X86.h index 762a1fa6f4d5..e07387f3ece3 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.h +++ b/clang/lib/Driver/ToolChains/Arch/X86.h @@ -1,36 +1,36 @@ //===--- X86.h - X86-specific Tool Helpers ----------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_ARCH_X86_H #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_ARCH_X86_H #include "clang/Driver/Driver.h" #include "llvm/ADT/StringRef.h" #include "llvm/Option/Option.h" #include "llvm/TargetParser/Triple.h" #include #include namespace clang { namespace driver { namespace tools { namespace x86 { std::string getX86TargetCPU(const Driver &D, const llvm::opt::ArgList &Args, const llvm::Triple &Triple); void getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, const llvm::opt::ArgList &Args, - std::vector &Features, bool ForAS); + std::vector &Features); } // end namespace x86 } // end namespace target } // end namespace driver } // end namespace clang #endif // LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_ARCH_X86_H diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 8766d34eec53..0d6907b8e5c7 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1,2443 +1,2443 @@ //===--- CommonArgs.cpp - Args handling for multiple toolchains -*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "CommonArgs.h" #include "Arch/AArch64.h" #include "Arch/ARM.h" #include "Arch/CSKY.h" #include "Arch/LoongArch.h" #include "Arch/M68k.h" #include "Arch/Mips.h" #include "Arch/PPC.h" #include "Arch/RISCV.h" #include "Arch/Sparc.h" #include "Arch/SystemZ.h" #include "Arch/VE.h" #include "Arch/X86.h" #include "HIPAMD.h" #include "Hexagon.h" #include "MSP430.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/LangOptions.h" #include "clang/Basic/ObjCRuntime.h" #include "clang/Basic/Version.h" #include "clang/Config/config.h" #include "clang/Driver/Action.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/DriverDiagnostic.h" #include "clang/Driver/InputInfo.h" #include "clang/Driver/Job.h" #include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/ToolChain.h" #include "clang/Driver/Util.h" #include "clang/Driver/XRayArgs.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/Magic.h" #include "llvm/Config/llvm-config.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Option/Option.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/Compression.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" #include "llvm/Support/Program.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/Threading.h" #include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/YAMLParser.h" #include "llvm/TargetParser/Host.h" #include "llvm/TargetParser/TargetParser.h" #include using namespace clang::driver; using namespace clang::driver::tools; using namespace clang; using namespace llvm::opt; static void renderRpassOptions(const ArgList &Args, ArgStringList &CmdArgs, const StringRef PluginOptPrefix) { if (const Arg *A = Args.getLastArg(options::OPT_Rpass_EQ)) CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + "-pass-remarks=" + A->getValue())); if (const Arg *A = Args.getLastArg(options::OPT_Rpass_missed_EQ)) CmdArgs.push_back(Args.MakeArgString( Twine(PluginOptPrefix) + "-pass-remarks-missed=" + A->getValue())); if (const Arg *A = Args.getLastArg(options::OPT_Rpass_analysis_EQ)) CmdArgs.push_back(Args.MakeArgString( Twine(PluginOptPrefix) + "-pass-remarks-analysis=" + A->getValue())); } static void renderRemarksOptions(const ArgList &Args, ArgStringList &CmdArgs, const llvm::Triple &Triple, const InputInfo &Input, const InputInfo &Output, const StringRef PluginOptPrefix) { StringRef Format = "yaml"; if (const Arg *A = Args.getLastArg(options::OPT_fsave_optimization_record_EQ)) Format = A->getValue(); SmallString<128> F; const Arg *A = Args.getLastArg(options::OPT_foptimization_record_file_EQ); if (A) F = A->getValue(); else if (Output.isFilename()) F = Output.getFilename(); assert(!F.empty() && "Cannot determine remarks output name."); // Append "opt.ld." to the end of the file name. CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + "opt-remarks-filename=" + F + ".opt.ld." 
+ Format)); if (const Arg *A = Args.getLastArg(options::OPT_foptimization_record_passes_EQ)) CmdArgs.push_back(Args.MakeArgString( Twine(PluginOptPrefix) + "opt-remarks-passes=" + A->getValue())); CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + "opt-remarks-format=" + Format.data())); } static void renderRemarksHotnessOptions(const ArgList &Args, ArgStringList &CmdArgs, const StringRef PluginOptPrefix) { if (Args.hasFlag(options::OPT_fdiagnostics_show_hotness, options::OPT_fno_diagnostics_show_hotness, false)) CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + "opt-remarks-with-hotness")); if (const Arg *A = Args.getLastArg(options::OPT_fdiagnostics_hotness_threshold_EQ)) CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "opt-remarks-hotness-threshold=" + A->getValue())); } static bool shouldIgnoreUnsupportedTargetFeature(const Arg &TargetFeatureArg, llvm::Triple T, StringRef Processor) { // Warn no-cumode for AMDGCN processors not supporing WGP mode. if (!T.isAMDGPU()) return false; auto GPUKind = T.isAMDGCN() ? llvm::AMDGPU::parseArchAMDGCN(Processor) : llvm::AMDGPU::parseArchR600(Processor); auto GPUFeatures = T.isAMDGCN() ? llvm::AMDGPU::getArchAttrAMDGCN(GPUKind) : llvm::AMDGPU::getArchAttrR600(GPUKind); if (GPUFeatures & llvm::AMDGPU::FEATURE_WGP) return false; return TargetFeatureArg.getOption().matches(options::OPT_mno_cumode); } void tools::addPathIfExists(const Driver &D, const Twine &Path, ToolChain::path_list &Paths) { if (D.getVFS().exists(Path)) Paths.push_back(Path.str()); } void tools::handleTargetFeaturesGroup(const Driver &D, const llvm::Triple &Triple, const ArgList &Args, std::vector &Features, OptSpecifier Group) { std::set Warned; for (const Arg *A : Args.filtered(Group)) { StringRef Name = A->getOption().getName(); A->claim(); // Skip over "-m". assert(Name.startswith("m") && "Invalid feature name."); Name = Name.substr(1); auto Proc = getCPUName(D, Args, Triple); if (shouldIgnoreUnsupportedTargetFeature(*A, Triple, Proc)) { if (Warned.count(Name) == 0) { D.getDiags().Report( clang::diag::warn_drv_unsupported_option_for_processor) << A->getAsString(Args) << Proc; Warned.insert(Name); } continue; } bool IsNegative = Name.startswith("no-"); if (IsNegative) Name = Name.substr(3); Features.push_back(Args.MakeArgString((IsNegative ? "-" : "+") + Name)); } } SmallVector tools::unifyTargetFeatures(ArrayRef Features) { // Only add a feature if it hasn't been seen before starting from the end. SmallVector UnifiedFeatures; llvm::DenseSet UsedFeatures; for (StringRef Feature : llvm::reverse(Features)) { if (UsedFeatures.insert(Feature.drop_front()).second) UnifiedFeatures.insert(UnifiedFeatures.begin(), Feature); } return UnifiedFeatures; } void tools::addDirectoryList(const ArgList &Args, ArgStringList &CmdArgs, const char *ArgName, const char *EnvVar) { const char *DirList = ::getenv(EnvVar); bool CombinedArg = false; if (!DirList) return; // Nothing to do. StringRef Name(ArgName); if (Name.equals("-I") || Name.equals("-L") || Name.empty()) CombinedArg = true; StringRef Dirs(DirList); if (Dirs.empty()) // Empty string should not add '.'. return; StringRef::size_type Delim; while ((Delim = Dirs.find(llvm::sys::EnvPathSeparator)) != StringRef::npos) { if (Delim == 0) { // Leading colon. 
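// --- illustrative sketch (not part of the patch) ---------------------------
// The de-duplication performed by unifyTargetFeatures above, restated
// standalone: walk the accumulated "+feature"/"-feature" strings from the
// back so the last request for a given feature wins, and keep the surviving
// entries in their original relative order. Names are invented for
// illustration.
#include <set>
#include <string>
#include <vector>

static std::vector<std::string>
unifyFeatureRequests(const std::vector<std::string> &Features) {
  std::vector<std::string> Unified;
  std::set<std::string> Seen;
  for (auto It = Features.rbegin(); It != Features.rend(); ++It) {
    // Key on the feature name without its leading '+'/'-' sign.
    if (Seen.insert(It->substr(1)).second)
      Unified.insert(Unified.begin(), *It); // Preserve original order.
  }
  return Unified;
}
// ----------------------------------------------------------------------------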
if (CombinedArg) { CmdArgs.push_back(Args.MakeArgString(std::string(ArgName) + ".")); } else { CmdArgs.push_back(ArgName); CmdArgs.push_back("."); } } else { if (CombinedArg) { CmdArgs.push_back( Args.MakeArgString(std::string(ArgName) + Dirs.substr(0, Delim))); } else { CmdArgs.push_back(ArgName); CmdArgs.push_back(Args.MakeArgString(Dirs.substr(0, Delim))); } } Dirs = Dirs.substr(Delim + 1); } if (Dirs.empty()) { // Trailing colon. if (CombinedArg) { CmdArgs.push_back(Args.MakeArgString(std::string(ArgName) + ".")); } else { CmdArgs.push_back(ArgName); CmdArgs.push_back("."); } } else { // Add the last path. if (CombinedArg) { CmdArgs.push_back(Args.MakeArgString(std::string(ArgName) + Dirs)); } else { CmdArgs.push_back(ArgName); CmdArgs.push_back(Args.MakeArgString(Dirs)); } } } void tools::AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs, const ArgList &Args, ArgStringList &CmdArgs, const JobAction &JA) { const Driver &D = TC.getDriver(); // Add extra linker input arguments which are not treated as inputs // (constructed via -Xarch_). Args.AddAllArgValues(CmdArgs, options::OPT_Zlinker_input); // LIBRARY_PATH are included before user inputs and only supported on native // toolchains. if (!TC.isCrossCompiling()) addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH"); for (const auto &II : Inputs) { // If the current tool chain refers to an OpenMP offloading host, we // should ignore inputs that refer to OpenMP offloading devices - // they will be embedded according to a proper linker script. if (auto *IA = II.getAction()) if ((JA.isHostOffloading(Action::OFK_OpenMP) && IA->isDeviceOffloading(Action::OFK_OpenMP))) continue; if (!TC.HasNativeLLVMSupport() && types::isLLVMIR(II.getType())) // Don't try to pass LLVM inputs unless we have native support. D.Diag(diag::err_drv_no_linker_llvm_support) << TC.getTripleString(); // Add filenames immediately. if (II.isFilename()) { CmdArgs.push_back(II.getFilename()); continue; } // In some error cases, the input could be Nothing; skip those. if (II.isNothing()) continue; // Otherwise, this is a linker input argument. const Arg &A = II.getInputArg(); // Handle reserved library options. if (A.getOption().matches(options::OPT_Z_reserved_lib_stdcxx)) TC.AddCXXStdlibLibArgs(Args, CmdArgs); else if (A.getOption().matches(options::OPT_Z_reserved_lib_cckext)) TC.AddCCKextLibArgs(Args, CmdArgs); else A.renderAsInput(Args, CmdArgs); } } void tools::addLinkerCompressDebugSectionsOption( const ToolChain &TC, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) { // GNU ld supports --compress-debug-sections=none|zlib|zlib-gnu|zlib-gabi // whereas zlib is an alias to zlib-gabi and zlib-gnu is obsoleted. Therefore // -gz=none|zlib are translated to --compress-debug-sections=none|zlib. -gz // is not translated since ld --compress-debug-sections option requires an // argument. 
if (const Arg *A = Args.getLastArg(options::OPT_gz_EQ)) { StringRef V = A->getValue(); if (V == "none" || V == "zlib" || V == "zstd") CmdArgs.push_back(Args.MakeArgString("--compress-debug-sections=" + V)); else TC.getDriver().Diag(diag::err_drv_unsupported_option_argument) << A->getSpelling() << V; } } void tools::AddTargetFeature(const ArgList &Args, std::vector &Features, OptSpecifier OnOpt, OptSpecifier OffOpt, StringRef FeatureName) { if (Arg *A = Args.getLastArg(OnOpt, OffOpt)) { if (A->getOption().matches(OnOpt)) Features.push_back(Args.MakeArgString("+" + FeatureName)); else Features.push_back(Args.MakeArgString("-" + FeatureName)); } } /// Get the (LLVM) name of the AMDGPU gpu we are targeting. static std::string getAMDGPUTargetGPU(const llvm::Triple &T, const ArgList &Args) { Arg *MArch = Args.getLastArg(options::OPT_march_EQ); if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { auto GPUName = getProcessorFromTargetID(T, A->getValue()); return llvm::StringSwitch(GPUName) .Cases("rv630", "rv635", "r600") .Cases("rv610", "rv620", "rs780", "rs880") .Case("rv740", "rv770") .Case("palm", "cedar") .Cases("sumo", "sumo2", "sumo") .Case("hemlock", "cypress") .Case("aruba", "cayman") .Default(GPUName.str()); } if (MArch) return getProcessorFromTargetID(T, MArch->getValue()).str(); return ""; } static std::string getLanaiTargetCPU(const ArgList &Args) { if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { return A->getValue(); } return ""; } /// Get the (LLVM) name of the WebAssembly cpu we are targeting. static StringRef getWebAssemblyTargetCPU(const ArgList &Args) { // If we have -mcpu=, use that. if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { StringRef CPU = A->getValue(); #ifdef __wasm__ // Handle "native" by examining the host. "native" isn't meaningful when // cross compiling, so only support this when the host is also WebAssembly. 
if (CPU == "native") return llvm::sys::getHostCPUName(); #endif return CPU; } return "generic"; } std::string tools::getCPUName(const Driver &D, const ArgList &Args, const llvm::Triple &T, bool FromAs) { Arg *A; switch (T.getArch()) { default: return ""; case llvm::Triple::aarch64: case llvm::Triple::aarch64_32: case llvm::Triple::aarch64_be: return aarch64::getAArch64TargetCPU(Args, T, A); case llvm::Triple::arm: case llvm::Triple::armeb: case llvm::Triple::thumb: case llvm::Triple::thumbeb: { StringRef MArch, MCPU; arm::getARMArchCPUFromArgs(Args, MArch, MCPU, FromAs); return arm::getARMTargetCPU(MCPU, MArch, T); } case llvm::Triple::avr: if (const Arg *A = Args.getLastArg(options::OPT_mmcu_EQ)) return A->getValue(); return ""; case llvm::Triple::m68k: return m68k::getM68kTargetCPU(Args); case llvm::Triple::mips: case llvm::Triple::mipsel: case llvm::Triple::mips64: case llvm::Triple::mips64el: { StringRef CPUName; StringRef ABIName; mips::getMipsCPUAndABI(Args, T, CPUName, ABIName); return std::string(CPUName); } case llvm::Triple::nvptx: case llvm::Triple::nvptx64: if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) return A->getValue(); return ""; case llvm::Triple::ppc: case llvm::Triple::ppcle: case llvm::Triple::ppc64: case llvm::Triple::ppc64le: return ppc::getPPCTargetCPU(D, Args, T); case llvm::Triple::csky: if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) return A->getValue(); else if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) return A->getValue(); else return "ck810"; case llvm::Triple::riscv32: case llvm::Triple::riscv64: return riscv::getRISCVTargetCPU(Args, T); case llvm::Triple::bpfel: case llvm::Triple::bpfeb: if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) return A->getValue(); return ""; case llvm::Triple::sparc: case llvm::Triple::sparcel: case llvm::Triple::sparcv9: return sparc::getSparcTargetCPU(D, Args, T); case llvm::Triple::x86: case llvm::Triple::x86_64: return x86::getX86TargetCPU(D, Args, T); case llvm::Triple::hexagon: return "hexagon" + toolchains::HexagonToolChain::GetTargetCPUVersion(Args).str(); case llvm::Triple::lanai: return getLanaiTargetCPU(Args); case llvm::Triple::systemz: return systemz::getSystemZTargetCPU(Args); case llvm::Triple::r600: case llvm::Triple::amdgcn: return getAMDGPUTargetGPU(T, Args); case llvm::Triple::wasm32: case llvm::Triple::wasm64: return std::string(getWebAssemblyTargetCPU(Args)); case llvm::Triple::loongarch32: case llvm::Triple::loongarch64: return loongarch::getLoongArchTargetCPU(Args, T); } } static void getWebAssemblyTargetFeatures(const Driver &D, const llvm::Triple &Triple, const ArgList &Args, std::vector &Features) { handleTargetFeaturesGroup(D, Triple, Args, Features, options::OPT_m_wasm_Features_Group); } void tools::getTargetFeatures(const Driver &D, const llvm::Triple &Triple, const ArgList &Args, ArgStringList &CmdArgs, bool ForAS, bool IsAux) { std::vector Features; switch (Triple.getArch()) { default: break; case llvm::Triple::mips: case llvm::Triple::mipsel: case llvm::Triple::mips64: case llvm::Triple::mips64el: mips::getMIPSTargetFeatures(D, Triple, Args, Features); break; case llvm::Triple::arm: case llvm::Triple::armeb: case llvm::Triple::thumb: case llvm::Triple::thumbeb: arm::getARMTargetFeatures(D, Triple, Args, Features, ForAS); break; case llvm::Triple::ppc: case llvm::Triple::ppcle: case llvm::Triple::ppc64: case llvm::Triple::ppc64le: ppc::getPPCTargetFeatures(D, Triple, Args, Features); break; case llvm::Triple::riscv32: case llvm::Triple::riscv64: 
riscv::getRISCVTargetFeatures(D, Triple, Args, Features); break; case llvm::Triple::systemz: systemz::getSystemZTargetFeatures(D, Args, Features); break; case llvm::Triple::aarch64: case llvm::Triple::aarch64_32: case llvm::Triple::aarch64_be: aarch64::getAArch64TargetFeatures(D, Triple, Args, Features, ForAS); break; case llvm::Triple::x86: case llvm::Triple::x86_64: - x86::getX86TargetFeatures(D, Triple, Args, Features, ForAS); + x86::getX86TargetFeatures(D, Triple, Args, Features); break; case llvm::Triple::hexagon: hexagon::getHexagonTargetFeatures(D, Triple, Args, Features); break; case llvm::Triple::wasm32: case llvm::Triple::wasm64: getWebAssemblyTargetFeatures(D, Triple, Args, Features); break; case llvm::Triple::sparc: case llvm::Triple::sparcel: case llvm::Triple::sparcv9: sparc::getSparcTargetFeatures(D, Args, Features); break; case llvm::Triple::r600: case llvm::Triple::amdgcn: amdgpu::getAMDGPUTargetFeatures(D, Triple, Args, Features); break; case llvm::Triple::nvptx: case llvm::Triple::nvptx64: NVPTX::getNVPTXTargetFeatures(D, Triple, Args, Features); break; case llvm::Triple::m68k: m68k::getM68kTargetFeatures(D, Triple, Args, Features); break; case llvm::Triple::msp430: msp430::getMSP430TargetFeatures(D, Args, Features); break; case llvm::Triple::ve: ve::getVETargetFeatures(D, Args, Features); break; case llvm::Triple::csky: csky::getCSKYTargetFeatures(D, Triple, Args, CmdArgs, Features); break; case llvm::Triple::loongarch32: case llvm::Triple::loongarch64: loongarch::getLoongArchTargetFeatures(D, Triple, Args, Features); break; } for (auto Feature : unifyTargetFeatures(Features)) { CmdArgs.push_back(IsAux ? "-aux-target-feature" : "-target-feature"); CmdArgs.push_back(Feature.data()); } } llvm::StringRef tools::getLTOParallelism(const ArgList &Args, const Driver &D) { Arg *LtoJobsArg = Args.getLastArg(options::OPT_flto_jobs_EQ); if (!LtoJobsArg) return {}; if (!llvm::get_threadpool_strategy(LtoJobsArg->getValue())) D.Diag(diag::err_drv_invalid_int_value) << LtoJobsArg->getAsString(Args) << LtoJobsArg->getValue(); return LtoJobsArg->getValue(); } // CloudABI and PS4/PS5 use -ffunction-sections and -fdata-sections by default. bool tools::isUseSeparateSections(const llvm::Triple &Triple) { return Triple.getOS() == llvm::Triple::CloudABI || Triple.isPS(); } void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, ArgStringList &CmdArgs, const InputInfo &Output, const InputInfo &Input, bool IsThinLTO) { const bool IsOSAIX = ToolChain.getTriple().isOSAIX(); const bool IsAMDGCN = ToolChain.getTriple().isAMDGCN(); const char *Linker = Args.MakeArgString(ToolChain.GetLinkerPath()); const Driver &D = ToolChain.getDriver(); if (llvm::sys::path::filename(Linker) != "ld.lld" && llvm::sys::path::stem(Linker) != "ld.lld") { // Tell the linker to load the plugin. This has to come before // AddLinkerInputs as gold requires -plugin and AIX ld requires -bplugin to // come before any -plugin-opt/-bplugin_opt that -Wl might forward. const char *PluginPrefix = IsOSAIX ? "-bplugin:" : ""; const char *PluginName = IsOSAIX ? "/libLTO" : "/LLVMgold"; if (!IsOSAIX) CmdArgs.push_back("-plugin"); #if defined(_WIN32) const char *Suffix = ".dll"; #elif defined(__APPLE__) const char *Suffix = ".dylib"; #else const char *Suffix = ".so"; #endif SmallString<1024> Plugin; llvm::sys::path::native(Twine(D.Dir) + "/../" CLANG_INSTALL_LIBDIR_BASENAME + PluginName + Suffix, Plugin); CmdArgs.push_back(Args.MakeArgString(Twine(PluginPrefix) + Plugin)); } const char *PluginOptPrefix = IsOSAIX ? 
"-bplugin_opt:" : "-plugin-opt="; const char *ExtraDash = IsOSAIX ? "-" : ""; // Note, this solution is far from perfect, better to encode it into IR // metadata, but this may not be worth it, since it looks like aranges is on // the way out. if (Args.hasArg(options::OPT_gdwarf_aranges)) { CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + "-generate-arange-section")); } // Try to pass driver level flags relevant to LTO code generation down to // the plugin. // Handle flags for selecting CPU variants. std::string CPU = getCPUName(D, Args, ToolChain.getTriple()); if (!CPU.empty()) CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + ExtraDash + "mcpu=" + CPU)); if (Arg *A = Args.getLastArg(options::OPT_O_Group)) { // The optimization level matches // CompilerInvocation.cpp:getOptimizationLevel(). StringRef OOpt; if (A->getOption().matches(options::OPT_O4) || A->getOption().matches(options::OPT_Ofast)) OOpt = "3"; else if (A->getOption().matches(options::OPT_O)) { OOpt = A->getValue(); if (OOpt == "g") OOpt = "1"; else if (OOpt == "s" || OOpt == "z") OOpt = "2"; } else if (A->getOption().matches(options::OPT_O0)) OOpt = "0"; if (!OOpt.empty()) { CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + ExtraDash + "O" + OOpt)); if (IsAMDGCN) CmdArgs.push_back(Args.MakeArgString(Twine("--lto-CGO") + OOpt)); } } if (Args.hasArg(options::OPT_gsplit_dwarf)) CmdArgs.push_back(Args.MakeArgString( Twine(PluginOptPrefix) + "dwo_dir=" + Output.getFilename() + "_dwo")); if (IsThinLTO && !IsOSAIX) CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + "thinlto")); else if (IsThinLTO && IsOSAIX) CmdArgs.push_back(Args.MakeArgString(Twine("-bdbg:thinlto"))); StringRef Parallelism = getLTOParallelism(Args, D); if (!Parallelism.empty()) CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "jobs=" + Parallelism)); // If an explicit debugger tuning argument appeared, pass it along. if (Arg *A = Args.getLastArg(options::OPT_gTune_Group, options::OPT_ggdbN_Group)) { if (A->getOption().matches(options::OPT_glldb)) CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-debugger-tune=lldb")); else if (A->getOption().matches(options::OPT_gsce)) CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-debugger-tune=sce")); else if (A->getOption().matches(options::OPT_gdbx)) CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-debugger-tune=dbx")); else CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-debugger-tune=gdb")); } if (IsOSAIX) { if (!ToolChain.useIntegratedAs()) CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-no-integrated-as=1")); // On AIX, clang assumes strict-dwarf is true if any debug option is // specified, unless it is told explicitly not to assume so. 
Arg *A = Args.getLastArg(options::OPT_g_Group); bool EnableDebugInfo = A && !A->getOption().matches(options::OPT_g0) && !A->getOption().matches(options::OPT_ggdb0); if (EnableDebugInfo && Args.hasFlag(options::OPT_gstrict_dwarf, options::OPT_gno_strict_dwarf, true)) CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-strict-dwarf=true")); for (const Arg *A : Args.filtered_reverse(options::OPT_mabi_EQ)) { StringRef V = A->getValue(); if (V == "vec-default") break; if (V == "vec-extabi") { CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-vec-extabi")); break; } } } bool UseSeparateSections = isUseSeparateSections(ToolChain.getEffectiveTriple()); if (Args.hasFlag(options::OPT_ffunction_sections, options::OPT_fno_function_sections, UseSeparateSections)) CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-function-sections=1")); else if (Args.hasArg(options::OPT_fno_function_sections)) CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-function-sections=0")); bool DataSectionsTurnedOff = false; if (Args.hasFlag(options::OPT_fdata_sections, options::OPT_fno_data_sections, UseSeparateSections)) { CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-data-sections=1")); } else if (Args.hasArg(options::OPT_fno_data_sections)) { DataSectionsTurnedOff = true; CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-data-sections=0")); } if (Args.hasArg(options::OPT_mxcoff_roptr) || Args.hasArg(options::OPT_mno_xcoff_roptr)) { bool HasRoptr = Args.hasFlag(options::OPT_mxcoff_roptr, options::OPT_mno_xcoff_roptr, false); StringRef OptStr = HasRoptr ? "-mxcoff-roptr" : "-mno-xcoff-roptr"; if (!IsOSAIX) D.Diag(diag::err_drv_unsupported_opt_for_target) << OptStr << ToolChain.getTriple().str(); if (HasRoptr) { // The data sections option is on by default on AIX. We only need to error // out when -fno-data-sections is specified explicitly to turn off data // sections. if (DataSectionsTurnedOff) D.Diag(diag::err_roptr_requires_data_sections); CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-mxcoff-roptr")); } } // Pass an option to enable split machine functions. if (auto *A = Args.getLastArg(options::OPT_fsplit_machine_functions, options::OPT_fno_split_machine_functions)) { if (A->getOption().matches(options::OPT_fsplit_machine_functions)) CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + "-split-machine-functions")); } if (Arg *A = getLastProfileSampleUseArg(Args)) { StringRef FName = A->getValue(); if (!llvm::sys::fs::exists(FName)) D.Diag(diag::err_drv_no_such_file) << FName; else CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + "sample-profile=" + FName)); } if (auto *CSPGOGenerateArg = getLastCSProfileGenerateArg(Args)) { CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + ExtraDash + "cs-profile-generate")); if (CSPGOGenerateArg->getOption().matches( options::OPT_fcs_profile_generate_EQ)) { SmallString<128> Path(CSPGOGenerateArg->getValue()); llvm::sys::path::append(Path, "default_%m.profraw"); CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + ExtraDash + "cs-profile-path=" + Path)); } else CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + ExtraDash + "cs-profile-path=default_%m.profraw")); } else if (auto *ProfileUseArg = getLastProfileUseArg(Args)) { SmallString<128> Path( ProfileUseArg->getNumValues() == 0 ? 
"" : ProfileUseArg->getValue()); if (Path.empty() || llvm::sys::fs::is_directory(Path)) llvm::sys::path::append(Path, "default.profdata"); CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + ExtraDash + "cs-profile-path=" + Path)); } // This controls whether or not we perform JustMyCode instrumentation. if (Args.hasFlag(options::OPT_fjmc, options::OPT_fno_jmc, false)) { if (ToolChain.getEffectiveTriple().isOSBinFormatELF()) CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + "-enable-jmc-instrument")); else D.Diag(clang::diag::warn_drv_fjmc_for_elf_only); } if (Args.hasFlag(options::OPT_femulated_tls, options::OPT_fno_emulated_tls, ToolChain.getTriple().hasDefaultEmulatedTLS())) { CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-emulated-tls")); } if (Args.hasFlag(options::OPT_fstack_size_section, options::OPT_fno_stack_size_section, false)) CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-stack-size-section")); // Setup statistics file output. SmallString<128> StatsFile = getStatsFileName(Args, Output, Input, D); if (!StatsFile.empty()) CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "stats-file=" + StatsFile)); // Setup crash diagnostics dir. if (Arg *A = Args.getLastArg(options::OPT_fcrash_diagnostics_dir)) CmdArgs.push_back(Args.MakeArgString( Twine(PluginOptPrefix) + "-crash-diagnostics-dir=" + A->getValue())); addX86AlignBranchArgs(D, Args, CmdArgs, /*IsLTO=*/true, PluginOptPrefix); // Handle remark diagnostics on screen options: '-Rpass-*'. renderRpassOptions(Args, CmdArgs, PluginOptPrefix); // Handle serialized remarks options: '-fsave-optimization-record' // and '-foptimization-record-*'. if (willEmitRemarks(Args)) renderRemarksOptions(Args, CmdArgs, ToolChain.getEffectiveTriple(), Input, Output, PluginOptPrefix); // Handle remarks hotness/threshold related options. renderRemarksHotnessOptions(Args, CmdArgs, PluginOptPrefix); addMachineOutlinerArgs(D, Args, CmdArgs, ToolChain.getEffectiveTriple(), /*IsLTO=*/true, PluginOptPrefix); } void tools::addOpenMPRuntimeLibraryPath(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs) { // Default to clang lib / lib64 folder, i.e. the same location as device // runtime. SmallString<256> DefaultLibPath = llvm::sys::path::parent_path(TC.getDriver().Dir); llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME); CmdArgs.push_back(Args.MakeArgString("-L" + DefaultLibPath)); } void tools::addArchSpecificRPath(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs) { // Enable -frtlib-add-rpath by default for the case of VE. const bool IsVE = TC.getTriple().isVE(); bool DefaultValue = IsVE; if (!Args.hasFlag(options::OPT_frtlib_add_rpath, options::OPT_fno_rtlib_add_rpath, DefaultValue)) return; for (const auto &CandidateRPath : TC.getArchSpecificLibPaths()) { if (TC.getVFS().exists(CandidateRPath)) { CmdArgs.push_back("-rpath"); CmdArgs.push_back(Args.MakeArgString(CandidateRPath)); } } } bool tools::addOpenMPRuntime(ArgStringList &CmdArgs, const ToolChain &TC, const ArgList &Args, bool ForceStaticHostRuntime, bool IsOffloadingHost, bool GompNeedsRT) { if (!Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, options::OPT_fno_openmp, false)) return false; Driver::OpenMPRuntimeKind RTKind = TC.getDriver().getOpenMPRuntime(Args); if (RTKind == Driver::OMPRT_Unknown) // Already diagnosed. 
return false; if (ForceStaticHostRuntime) CmdArgs.push_back("-Bstatic"); switch (RTKind) { case Driver::OMPRT_OMP: CmdArgs.push_back("-lomp"); break; case Driver::OMPRT_GOMP: CmdArgs.push_back("-lgomp"); break; case Driver::OMPRT_IOMP5: CmdArgs.push_back("-liomp5"); break; case Driver::OMPRT_Unknown: break; } if (ForceStaticHostRuntime) CmdArgs.push_back("-Bdynamic"); if (RTKind == Driver::OMPRT_GOMP && GompNeedsRT) CmdArgs.push_back("-lrt"); if (IsOffloadingHost) CmdArgs.push_back("-lomptarget"); if (IsOffloadingHost && !Args.hasArg(options::OPT_nogpulib)) CmdArgs.push_back("-lomptarget.devicertl"); addArchSpecificRPath(TC, Args, CmdArgs); addOpenMPRuntimeLibraryPath(TC, Args, CmdArgs); return true; } void tools::addFortranRuntimeLibs(const ToolChain &TC, llvm::opt::ArgStringList &CmdArgs) { if (TC.getTriple().isKnownWindowsMSVCEnvironment()) { CmdArgs.push_back("Fortran_main.lib"); CmdArgs.push_back("FortranRuntime.lib"); CmdArgs.push_back("FortranDecimal.lib"); } else { CmdArgs.push_back("-lFortran_main"); CmdArgs.push_back("-lFortranRuntime"); CmdArgs.push_back("-lFortranDecimal"); } } void tools::addFortranRuntimeLibraryPath(const ToolChain &TC, const llvm::opt::ArgList &Args, ArgStringList &CmdArgs) { // Default to the /../lib directory. This works fine on the // platforms that we have tested so far. We will probably have to re-fine // this in the future. In particular, on some platforms, we may need to use // lib64 instead of lib. SmallString<256> DefaultLibPath = llvm::sys::path::parent_path(TC.getDriver().Dir); llvm::sys::path::append(DefaultLibPath, "lib"); if (TC.getTriple().isKnownWindowsMSVCEnvironment()) CmdArgs.push_back(Args.MakeArgString("-libpath:" + DefaultLibPath)); else CmdArgs.push_back(Args.MakeArgString("-L" + DefaultLibPath)); } static void addSanitizerRuntime(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs, StringRef Sanitizer, bool IsShared, bool IsWhole) { // Wrap any static runtimes that must be forced into executable in // whole-archive. if (IsWhole) CmdArgs.push_back("--whole-archive"); CmdArgs.push_back(TC.getCompilerRTArgString( Args, Sanitizer, IsShared ? ToolChain::FT_Shared : ToolChain::FT_Static)); if (IsWhole) CmdArgs.push_back("--no-whole-archive"); if (IsShared) { addArchSpecificRPath(TC, Args, CmdArgs); } } // Tries to use a file with the list of dynamic symbols that need to be exported // from the runtime library. Returns true if the file was found. static bool addSanitizerDynamicList(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs, StringRef Sanitizer) { // Solaris ld defaults to --export-dynamic behaviour but doesn't support // the option, so don't try to pass it. if (TC.getTriple().getOS() == llvm::Triple::Solaris) return true; SmallString<128> SanRT(TC.getCompilerRT(Args, Sanitizer)); if (llvm::sys::fs::exists(SanRT + ".syms")) { CmdArgs.push_back(Args.MakeArgString("--dynamic-list=" + SanRT + ".syms")); return true; } return false; } const char *tools::getAsNeededOption(const ToolChain &TC, bool as_needed) { assert(!TC.getTriple().isOSAIX() && "AIX linker does not support any form of --as-needed option yet."); // While the Solaris 11.2 ld added --as-needed/--no-as-needed as aliases // for the native forms -z ignore/-z record, they are missing in Illumos, // so always use the native form. if (TC.getTriple().isOSSolaris()) return as_needed ? "-zignore" : "-zrecord"; else return as_needed ? 
"--as-needed" : "--no-as-needed"; } void tools::linkSanitizerRuntimeDeps(const ToolChain &TC, ArgStringList &CmdArgs) { // Force linking against the system libraries sanitizers depends on // (see PR15823 why this is necessary). CmdArgs.push_back(getAsNeededOption(TC, false)); // There's no libpthread or librt on RTEMS & Android. if (TC.getTriple().getOS() != llvm::Triple::RTEMS && !TC.getTriple().isAndroid() && !TC.getTriple().isOHOSFamily()) { CmdArgs.push_back("-lpthread"); if (!TC.getTriple().isOSOpenBSD()) CmdArgs.push_back("-lrt"); } CmdArgs.push_back("-lm"); // There's no libdl on all OSes. if (!TC.getTriple().isOSFreeBSD() && !TC.getTriple().isOSNetBSD() && !TC.getTriple().isOSOpenBSD() && TC.getTriple().getOS() != llvm::Triple::RTEMS) CmdArgs.push_back("-ldl"); // Required for backtrace on some OSes if (TC.getTriple().isOSFreeBSD() || TC.getTriple().isOSNetBSD() || TC.getTriple().isOSOpenBSD()) CmdArgs.push_back("-lexecinfo"); // There is no libresolv on Android, FreeBSD, OpenBSD, etc. On musl // libresolv.a, even if exists, is an empty archive to satisfy POSIX -lresolv // requirement. if (TC.getTriple().isOSLinux() && !TC.getTriple().isAndroid() && !TC.getTriple().isMusl()) CmdArgs.push_back("-lresolv"); } static void collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, SmallVectorImpl &SharedRuntimes, SmallVectorImpl &StaticRuntimes, SmallVectorImpl &NonWholeStaticRuntimes, SmallVectorImpl &HelperStaticRuntimes, SmallVectorImpl &RequiredSymbols) { const SanitizerArgs &SanArgs = TC.getSanitizerArgs(Args); // Collect shared runtimes. if (SanArgs.needsSharedRt()) { if (SanArgs.needsAsanRt() && SanArgs.linkRuntimes()) { SharedRuntimes.push_back("asan"); if (!Args.hasArg(options::OPT_shared) && !TC.getTriple().isAndroid()) HelperStaticRuntimes.push_back("asan-preinit"); } if (SanArgs.needsMemProfRt() && SanArgs.linkRuntimes()) { SharedRuntimes.push_back("memprof"); if (!Args.hasArg(options::OPT_shared) && !TC.getTriple().isAndroid()) HelperStaticRuntimes.push_back("memprof-preinit"); } if (SanArgs.needsUbsanRt() && SanArgs.linkRuntimes()) { if (SanArgs.requiresMinimalRuntime()) SharedRuntimes.push_back("ubsan_minimal"); else SharedRuntimes.push_back("ubsan_standalone"); } if (SanArgs.needsScudoRt() && SanArgs.linkRuntimes()) { SharedRuntimes.push_back("scudo_standalone"); } if (SanArgs.needsTsanRt() && SanArgs.linkRuntimes()) SharedRuntimes.push_back("tsan"); if (SanArgs.needsHwasanRt() && SanArgs.linkRuntimes()) { if (SanArgs.needsHwasanAliasesRt()) SharedRuntimes.push_back("hwasan_aliases"); else SharedRuntimes.push_back("hwasan"); if (!Args.hasArg(options::OPT_shared)) HelperStaticRuntimes.push_back("hwasan-preinit"); } } // The stats_client library is also statically linked into DSOs. if (SanArgs.needsStatsRt() && SanArgs.linkRuntimes()) StaticRuntimes.push_back("stats_client"); // Always link the static runtime regardless of DSO or executable. if (SanArgs.needsAsanRt()) HelperStaticRuntimes.push_back("asan_static"); // Collect static runtimes. if (Args.hasArg(options::OPT_shared)) { // Don't link static runtimes into DSOs. return; } // Each static runtime that has a DSO counterpart above is excluded below, // but runtimes that exist only as static are not affected by needsSharedRt. 
if (!SanArgs.needsSharedRt() && SanArgs.needsAsanRt() && SanArgs.linkRuntimes()) { StaticRuntimes.push_back("asan"); if (SanArgs.linkCXXRuntimes()) StaticRuntimes.push_back("asan_cxx"); } if (!SanArgs.needsSharedRt() && SanArgs.needsMemProfRt() && SanArgs.linkRuntimes()) { StaticRuntimes.push_back("memprof"); if (SanArgs.linkCXXRuntimes()) StaticRuntimes.push_back("memprof_cxx"); } if (!SanArgs.needsSharedRt() && SanArgs.needsHwasanRt() && SanArgs.linkRuntimes()) { if (SanArgs.needsHwasanAliasesRt()) { StaticRuntimes.push_back("hwasan_aliases"); if (SanArgs.linkCXXRuntimes()) StaticRuntimes.push_back("hwasan_aliases_cxx"); } else { StaticRuntimes.push_back("hwasan"); if (SanArgs.linkCXXRuntimes()) StaticRuntimes.push_back("hwasan_cxx"); } } if (SanArgs.needsDfsanRt() && SanArgs.linkRuntimes()) StaticRuntimes.push_back("dfsan"); if (SanArgs.needsLsanRt() && SanArgs.linkRuntimes()) StaticRuntimes.push_back("lsan"); if (SanArgs.needsMsanRt() && SanArgs.linkRuntimes()) { StaticRuntimes.push_back("msan"); if (SanArgs.linkCXXRuntimes()) StaticRuntimes.push_back("msan_cxx"); } if (!SanArgs.needsSharedRt() && SanArgs.needsTsanRt() && SanArgs.linkRuntimes()) { StaticRuntimes.push_back("tsan"); if (SanArgs.linkCXXRuntimes()) StaticRuntimes.push_back("tsan_cxx"); } if (!SanArgs.needsSharedRt() && SanArgs.needsUbsanRt() && SanArgs.linkRuntimes()) { if (SanArgs.requiresMinimalRuntime()) { StaticRuntimes.push_back("ubsan_minimal"); } else { StaticRuntimes.push_back("ubsan_standalone"); if (SanArgs.linkCXXRuntimes()) StaticRuntimes.push_back("ubsan_standalone_cxx"); } } if (SanArgs.needsSafeStackRt() && SanArgs.linkRuntimes()) { NonWholeStaticRuntimes.push_back("safestack"); RequiredSymbols.push_back("__safestack_init"); } if (!(SanArgs.needsSharedRt() && SanArgs.needsUbsanRt() && SanArgs.linkRuntimes())) { if (SanArgs.needsCfiRt() && SanArgs.linkRuntimes()) StaticRuntimes.push_back("cfi"); if (SanArgs.needsCfiDiagRt() && SanArgs.linkRuntimes()) { StaticRuntimes.push_back("cfi_diag"); if (SanArgs.linkCXXRuntimes()) StaticRuntimes.push_back("ubsan_standalone_cxx"); } } if (SanArgs.needsStatsRt() && SanArgs.linkRuntimes()) { NonWholeStaticRuntimes.push_back("stats"); RequiredSymbols.push_back("__sanitizer_stats_register"); } if (!SanArgs.needsSharedRt() && SanArgs.needsScudoRt() && SanArgs.linkRuntimes()) { StaticRuntimes.push_back("scudo_standalone"); if (SanArgs.linkCXXRuntimes()) StaticRuntimes.push_back("scudo_standalone_cxx"); } } // Should be called before we add system libraries (C++ ABI, libstdc++/libc++, // C runtime, etc). Returns true if sanitizer system deps need to be linked in. bool tools::addSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs) { SmallVector SharedRuntimes, StaticRuntimes, NonWholeStaticRuntimes, HelperStaticRuntimes, RequiredSymbols; collectSanitizerRuntimes(TC, Args, SharedRuntimes, StaticRuntimes, NonWholeStaticRuntimes, HelperStaticRuntimes, RequiredSymbols); const SanitizerArgs &SanArgs = TC.getSanitizerArgs(Args); // Inject libfuzzer dependencies. 
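  // When linked here, libFuzzer is pulled in as a whole static archive, and
  // because it is written in C++ the C++ standard library arguments are also
  // appended below unless -nostdlib++ is given.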
if (SanArgs.needsFuzzer() && SanArgs.linkRuntimes() && !Args.hasArg(options::OPT_shared)) { addSanitizerRuntime(TC, Args, CmdArgs, "fuzzer", false, true); if (SanArgs.needsFuzzerInterceptors()) addSanitizerRuntime(TC, Args, CmdArgs, "fuzzer_interceptors", false, true); if (!Args.hasArg(clang::driver::options::OPT_nostdlibxx)) { bool OnlyLibstdcxxStatic = Args.hasArg(options::OPT_static_libstdcxx) && !Args.hasArg(options::OPT_static); if (OnlyLibstdcxxStatic) CmdArgs.push_back("-Bstatic"); TC.AddCXXStdlibLibArgs(Args, CmdArgs); if (OnlyLibstdcxxStatic) CmdArgs.push_back("-Bdynamic"); } } for (auto RT : SharedRuntimes) addSanitizerRuntime(TC, Args, CmdArgs, RT, true, false); for (auto RT : HelperStaticRuntimes) addSanitizerRuntime(TC, Args, CmdArgs, RT, false, true); bool AddExportDynamic = false; for (auto RT : StaticRuntimes) { addSanitizerRuntime(TC, Args, CmdArgs, RT, false, true); AddExportDynamic |= !addSanitizerDynamicList(TC, Args, CmdArgs, RT); } for (auto RT : NonWholeStaticRuntimes) { addSanitizerRuntime(TC, Args, CmdArgs, RT, false, false); AddExportDynamic |= !addSanitizerDynamicList(TC, Args, CmdArgs, RT); } for (auto S : RequiredSymbols) { CmdArgs.push_back("-u"); CmdArgs.push_back(Args.MakeArgString(S)); } // If there is a static runtime with no dynamic list, force all the symbols // to be dynamic to be sure we export sanitizer interface functions. if (AddExportDynamic) CmdArgs.push_back("--export-dynamic"); if (SanArgs.hasCrossDsoCfi() && !AddExportDynamic) CmdArgs.push_back("--export-dynamic-symbol=__cfi_check"); if (SanArgs.hasMemTag()) { if (!TC.getTriple().isAndroid()) { TC.getDriver().Diag(diag::err_drv_unsupported_opt_for_target) << "-fsanitize=memtag*" << TC.getTriple().str(); } CmdArgs.push_back( Args.MakeArgString("--android-memtag-mode=" + SanArgs.getMemtagMode())); if (SanArgs.hasMemtagHeap()) CmdArgs.push_back("--android-memtag-heap"); if (SanArgs.hasMemtagStack()) CmdArgs.push_back("--android-memtag-stack"); } return !StaticRuntimes.empty() || !NonWholeStaticRuntimes.empty(); } bool tools::addXRayRuntime(const ToolChain&TC, const ArgList &Args, ArgStringList &CmdArgs) { if (Args.hasArg(options::OPT_shared)) return false; if (TC.getXRayArgs().needsXRayRt()) { CmdArgs.push_back("--whole-archive"); CmdArgs.push_back(TC.getCompilerRTArgString(Args, "xray")); for (const auto &Mode : TC.getXRayArgs().modeList()) CmdArgs.push_back(TC.getCompilerRTArgString(Args, Mode)); CmdArgs.push_back("--no-whole-archive"); return true; } return false; } void tools::linkXRayRuntimeDeps(const ToolChain &TC, ArgStringList &CmdArgs) { CmdArgs.push_back(getAsNeededOption(TC, false)); CmdArgs.push_back("-lpthread"); if (!TC.getTriple().isOSOpenBSD()) CmdArgs.push_back("-lrt"); CmdArgs.push_back("-lm"); if (!TC.getTriple().isOSFreeBSD() && !TC.getTriple().isOSNetBSD() && !TC.getTriple().isOSOpenBSD()) CmdArgs.push_back("-ldl"); } bool tools::areOptimizationsEnabled(const ArgList &Args) { // Find the last -O arg and see if it is non-zero. if (Arg *A = Args.getLastArg(options::OPT_O_Group)) return !A->getOption().matches(options::OPT_O0); // Defaults to -O0. 
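  // Note: any -O flag other than -O0 (e.g. -O1, -O2, -Os, -Og) counts as
  // "optimizations enabled"; with no -O flag at all we fall through and
  // report false.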
return false; } const char *tools::SplitDebugName(const JobAction &JA, const ArgList &Args, const InputInfo &Input, const InputInfo &Output) { auto AddPostfix = [JA](auto &F) { if (JA.getOffloadingDeviceKind() == Action::OFK_HIP) F += (Twine("_") + JA.getOffloadingArch()).str(); F += ".dwo"; }; if (Arg *A = Args.getLastArg(options::OPT_gsplit_dwarf_EQ)) if (StringRef(A->getValue()) == "single" && Output.isFilename()) return Args.MakeArgString(Output.getFilename()); SmallString<128> T; if (const Arg *A = Args.getLastArg(options::OPT_dumpdir)) { T = A->getValue(); } else { Arg *FinalOutput = Args.getLastArg(options::OPT_o, options::OPT__SLASH_o); if (FinalOutput && Args.hasArg(options::OPT_c)) { T = FinalOutput->getValue(); llvm::sys::path::remove_filename(T); llvm::sys::path::append(T, llvm::sys::path::stem(FinalOutput->getValue())); AddPostfix(T); return Args.MakeArgString(T); } } T += llvm::sys::path::stem(Input.getBaseInput()); AddPostfix(T); return Args.MakeArgString(T); } void tools::SplitDebugInfo(const ToolChain &TC, Compilation &C, const Tool &T, const JobAction &JA, const ArgList &Args, const InputInfo &Output, const char *OutFile) { ArgStringList ExtractArgs; ExtractArgs.push_back("--extract-dwo"); ArgStringList StripArgs; StripArgs.push_back("--strip-dwo"); // Grabbing the output of the earlier compile step. StripArgs.push_back(Output.getFilename()); ExtractArgs.push_back(Output.getFilename()); ExtractArgs.push_back(OutFile); const char *Exec = Args.MakeArgString(TC.GetProgramPath(CLANG_DEFAULT_OBJCOPY)); InputInfo II(types::TY_Object, Output.getFilename(), Output.getFilename()); // First extract the dwo sections. C.addCommand(std::make_unique(JA, T, ResponseFileSupport::AtFileCurCP(), Exec, ExtractArgs, II, Output)); // Then remove them from the original .o file. C.addCommand(std::make_unique( JA, T, ResponseFileSupport::AtFileCurCP(), Exec, StripArgs, II, Output)); } // Claim options we don't want to warn if they are unused. We do this for // options that build systems might add but are unused when assembling or only // running the preprocessor for example. void tools::claimNoWarnArgs(const ArgList &Args) { // Don't warn about unused -f(no-)?lto. This can happen when we're // preprocessing, precompiling or assembling. 
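  // For example, "clang -flto -E foo.c" would otherwise warn that -flto was
  // unused; claiming the flags below suppresses that diagnostic.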
Args.ClaimAllArgs(options::OPT_flto_EQ); Args.ClaimAllArgs(options::OPT_flto); Args.ClaimAllArgs(options::OPT_fno_lto); } Arg *tools::getLastCSProfileGenerateArg(const ArgList &Args) { auto *CSPGOGenerateArg = Args.getLastArg(options::OPT_fcs_profile_generate, options::OPT_fcs_profile_generate_EQ, options::OPT_fno_profile_generate); if (CSPGOGenerateArg && CSPGOGenerateArg->getOption().matches(options::OPT_fno_profile_generate)) CSPGOGenerateArg = nullptr; return CSPGOGenerateArg; } Arg *tools::getLastProfileUseArg(const ArgList &Args) { auto *ProfileUseArg = Args.getLastArg( options::OPT_fprofile_instr_use, options::OPT_fprofile_instr_use_EQ, options::OPT_fprofile_use, options::OPT_fprofile_use_EQ, options::OPT_fno_profile_instr_use); if (ProfileUseArg && ProfileUseArg->getOption().matches(options::OPT_fno_profile_instr_use)) ProfileUseArg = nullptr; return ProfileUseArg; } Arg *tools::getLastProfileSampleUseArg(const ArgList &Args) { auto *ProfileSampleUseArg = Args.getLastArg( options::OPT_fprofile_sample_use, options::OPT_fprofile_sample_use_EQ, options::OPT_fauto_profile, options::OPT_fauto_profile_EQ, options::OPT_fno_profile_sample_use, options::OPT_fno_auto_profile); if (ProfileSampleUseArg && (ProfileSampleUseArg->getOption().matches( options::OPT_fno_profile_sample_use) || ProfileSampleUseArg->getOption().matches(options::OPT_fno_auto_profile))) return nullptr; return Args.getLastArg(options::OPT_fprofile_sample_use_EQ, options::OPT_fauto_profile_EQ); } const char *tools::RelocationModelName(llvm::Reloc::Model Model) { switch (Model) { case llvm::Reloc::Static: return "static"; case llvm::Reloc::PIC_: return "pic"; case llvm::Reloc::DynamicNoPIC: return "dynamic-no-pic"; case llvm::Reloc::ROPI: return "ropi"; case llvm::Reloc::RWPI: return "rwpi"; case llvm::Reloc::ROPI_RWPI: return "ropi-rwpi"; } llvm_unreachable("Unknown Reloc::Model kind"); } /// Parses the various -fpic/-fPIC/-fpie/-fPIE arguments. Then, /// smooshes them together with platform defaults, to decide whether /// this compile should be using PIC mode or not. Returns a tuple of /// (RelocationModel, PICLevel, IsPIE). std::tuple tools::ParsePICArgs(const ToolChain &ToolChain, const ArgList &Args) { const llvm::Triple &EffectiveTriple = ToolChain.getEffectiveTriple(); const llvm::Triple &Triple = ToolChain.getTriple(); bool PIE = ToolChain.isPIEDefault(Args); bool PIC = PIE || ToolChain.isPICDefault(); // The Darwin/MachO default to use PIC does not apply when using -static. 
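  // For example, "clang -static foo.c" for a Mach-O target starts from
  // non-PIC/non-PIE here even though PIC is otherwise the platform default.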
if (Triple.isOSBinFormatMachO() && Args.hasArg(options::OPT_static)) PIE = PIC = false; bool IsPICLevelTwo = PIC; bool KernelOrKext = Args.hasArg(options::OPT_mkernel, options::OPT_fapple_kext); // Android-specific defaults for PIC/PIE if (Triple.isAndroid()) { switch (Triple.getArch()) { case llvm::Triple::arm: case llvm::Triple::armeb: case llvm::Triple::thumb: case llvm::Triple::thumbeb: case llvm::Triple::aarch64: case llvm::Triple::mips: case llvm::Triple::mipsel: case llvm::Triple::mips64: case llvm::Triple::mips64el: PIC = true; // "-fpic" break; case llvm::Triple::x86: case llvm::Triple::x86_64: PIC = true; // "-fPIC" IsPICLevelTwo = true; break; default: break; } } // OHOS-specific defaults for PIC/PIE if (Triple.isOHOSFamily() && Triple.getArch() == llvm::Triple::aarch64) PIC = true; // OpenBSD-specific defaults for PIE if (Triple.isOSOpenBSD()) { switch (ToolChain.getArch()) { case llvm::Triple::arm: case llvm::Triple::aarch64: case llvm::Triple::mips64: case llvm::Triple::mips64el: case llvm::Triple::x86: case llvm::Triple::x86_64: IsPICLevelTwo = false; // "-fpie" break; case llvm::Triple::ppc: case llvm::Triple::sparcv9: IsPICLevelTwo = true; // "-fPIE" break; default: break; } } // The last argument relating to either PIC or PIE wins, and no // other argument is used. If the last argument is any flavor of the // '-fno-...' arguments, both PIC and PIE are disabled. Any PIE // option implicitly enables PIC at the same level. Arg *LastPICArg = Args.getLastArg(options::OPT_fPIC, options::OPT_fno_PIC, options::OPT_fpic, options::OPT_fno_pic, options::OPT_fPIE, options::OPT_fno_PIE, options::OPT_fpie, options::OPT_fno_pie); if (Triple.isOSWindows() && !Triple.isOSCygMing() && LastPICArg && LastPICArg == Args.getLastArg(options::OPT_fPIC, options::OPT_fpic, options::OPT_fPIE, options::OPT_fpie)) { ToolChain.getDriver().Diag(diag::err_drv_unsupported_opt_for_target) << LastPICArg->getSpelling() << Triple.str(); if (Triple.getArch() == llvm::Triple::x86_64) return std::make_tuple(llvm::Reloc::PIC_, 2U, false); return std::make_tuple(llvm::Reloc::Static, 0U, false); } // Check whether the tool chain trumps the PIC-ness decision. If the PIC-ness // is forced, then neither PIC nor PIE flags will have no effect. if (!ToolChain.isPICDefaultForced()) { if (LastPICArg) { Option O = LastPICArg->getOption(); if (O.matches(options::OPT_fPIC) || O.matches(options::OPT_fpic) || O.matches(options::OPT_fPIE) || O.matches(options::OPT_fpie)) { PIE = O.matches(options::OPT_fPIE) || O.matches(options::OPT_fpie); PIC = PIE || O.matches(options::OPT_fPIC) || O.matches(options::OPT_fpic); IsPICLevelTwo = O.matches(options::OPT_fPIE) || O.matches(options::OPT_fPIC); } else { PIE = PIC = false; if (EffectiveTriple.isPS()) { Arg *ModelArg = Args.getLastArg(options::OPT_mcmodel_EQ); StringRef Model = ModelArg ? ModelArg->getValue() : ""; if (Model != "kernel") { PIC = true; ToolChain.getDriver().Diag(diag::warn_drv_ps_force_pic) << LastPICArg->getSpelling() << (EffectiveTriple.isPS4() ? "PS4" : "PS5"); } } } } } // Introduce a Darwin and PS4/PS5-specific hack. If the default is PIC, but // the PIC level would've been set to level 1, force it back to level 2 PIC // instead. if (PIC && (Triple.isOSDarwin() || EffectiveTriple.isPS())) IsPICLevelTwo |= ToolChain.isPICDefault(); // This kernel flags are a trump-card: they will disable PIC/PIE // generation, independent of the argument order. 
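  // For instance, -mkernel or -fapple-kext forces non-PIC below even if -fPIC
  // appears later on the command line, except on the newer iOS, watchOS and
  // DriverKit targets excluded by the check.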
  if (KernelOrKext &&
      ((!EffectiveTriple.isiOS() || EffectiveTriple.isOSVersionLT(6)) &&
       !EffectiveTriple.isWatchOS() && !EffectiveTriple.isDriverKit()))
    PIC = PIE = false;

  if (Arg *A = Args.getLastArg(options::OPT_mdynamic_no_pic)) {
    // This is a very special mode. It trumps the other modes, almost no one
    // uses it, and it isn't even valid on any OS but Darwin.
    if (!Triple.isOSDarwin())
      ToolChain.getDriver().Diag(diag::err_drv_unsupported_opt_for_target)
          << A->getSpelling() << Triple.str();

    // FIXME: Warn when this flag trumps some other PIC or PIE flag.

    // Only a forced PIC mode can cause the actual compile to have PIC defines
    // etc., no flags are sufficient. This behavior was selected to closely
    // match that of llvm-gcc and Apple GCC before that.
    PIC = ToolChain.isPICDefault() && ToolChain.isPICDefaultForced();

    return std::make_tuple(llvm::Reloc::DynamicNoPIC, PIC ? 2U : 0U, false);
  }

  bool EmbeddedPISupported;
  switch (Triple.getArch()) {
  case llvm::Triple::arm:
  case llvm::Triple::armeb:
  case llvm::Triple::thumb:
  case llvm::Triple::thumbeb:
    EmbeddedPISupported = true;
    break;
  default:
    EmbeddedPISupported = false;
    break;
  }

  bool ROPI = false, RWPI = false;
  Arg *LastROPIArg = Args.getLastArg(options::OPT_fropi, options::OPT_fno_ropi);
  if (LastROPIArg && LastROPIArg->getOption().matches(options::OPT_fropi)) {
    if (!EmbeddedPISupported)
      ToolChain.getDriver().Diag(diag::err_drv_unsupported_opt_for_target)
          << LastROPIArg->getSpelling() << Triple.str();
    ROPI = true;
  }
  Arg *LastRWPIArg = Args.getLastArg(options::OPT_frwpi, options::OPT_fno_rwpi);
  if (LastRWPIArg && LastRWPIArg->getOption().matches(options::OPT_frwpi)) {
    if (!EmbeddedPISupported)
      ToolChain.getDriver().Diag(diag::err_drv_unsupported_opt_for_target)
          << LastRWPIArg->getSpelling() << Triple.str();
    RWPI = true;
  }

  // ROPI and RWPI are not compatible with PIC or PIE.
  if ((ROPI || RWPI) && (PIC || PIE))
    ToolChain.getDriver().Diag(diag::err_drv_ropi_rwpi_incompatible_with_pic);

  if (Triple.isMIPS()) {
    StringRef CPUName;
    StringRef ABIName;
    mips::getMipsCPUAndABI(Args, Triple, CPUName, ABIName);
    // When targeting the N64 ABI, PIC is the default, except in the case
    // when the -mno-abicalls option is used. In that case we exit
    // at next check regardless of PIC being set below.
    if (ABIName == "n64")
      PIC = true;
    // When targeting MIPS with -mno-abicalls, it's always static.
    if (Args.hasArg(options::OPT_mno_abicalls))
      return std::make_tuple(llvm::Reloc::Static, 0U, false);
    // Unlike other architectures, MIPS, even with -fPIC/-mxgot/multigot,
    // does not use PIC level 2 for historical reasons.
    IsPICLevelTwo = false;
  }

  if (PIC)
    return std::make_tuple(llvm::Reloc::PIC_, IsPICLevelTwo ? 2U : 1U, PIE);

  llvm::Reloc::Model RelocM = llvm::Reloc::Static;
  if (ROPI && RWPI)
    RelocM = llvm::Reloc::ROPI_RWPI;
  else if (ROPI)
    RelocM = llvm::Reloc::ROPI;
  else if (RWPI)
    RelocM = llvm::Reloc::RWPI;
  return std::make_tuple(RelocM, 0U, false);
}

// `-falign-functions` indicates that the functions should be aligned to a
// 16-byte boundary.
//
// `-falign-functions=1` is the same as `-fno-align-functions`.
//
// The scalar `n` in `-falign-functions=n` must be an integral value between
// [0, 65536]. If the value is not a power-of-two, it will be rounded up to
// the nearest power-of-two.
//
// If we return `0`, the frontend will default to the backend's preferred
// alignment.
//
// NOTE: icc only allows values between [0, 4096]. icc uses `-falign-functions`
// to mean `-falign-functions=16`. GCC defaults to the backend's preferred
// alignment.
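// Illustrative mapping for ParseFunctionAlignment below (the returned value
// is the log2 of the requested alignment):
//   -falign-functions=1  -> 0  (same as -fno-align-functions)
//   -falign-functions=20 -> 5  (rounded up to a 32-byte boundary)
//   -falign-functions=64 -> 6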
// For functions with no requested alignment, we default to the backend's
// preferred alignment.
unsigned tools::ParseFunctionAlignment(const ToolChain &TC,
                                       const ArgList &Args) {
  const Arg *A = Args.getLastArg(options::OPT_falign_functions,
                                 options::OPT_falign_functions_EQ,
                                 options::OPT_fno_align_functions);
  if (!A || A->getOption().matches(options::OPT_fno_align_functions))
    return 0;

  if (A->getOption().matches(options::OPT_falign_functions))
    return 0;

  unsigned Value = 0;
  if (StringRef(A->getValue()).getAsInteger(10, Value) || Value > 65536)
    TC.getDriver().Diag(diag::err_drv_invalid_int_value)
        << A->getAsString(Args) << A->getValue();
  return Value ? llvm::Log2_32_Ceil(std::min(Value, 65536u)) : Value;
}

void tools::addDebugInfoKind(
    ArgStringList &CmdArgs, llvm::codegenoptions::DebugInfoKind DebugInfoKind) {
  switch (DebugInfoKind) {
  case llvm::codegenoptions::DebugDirectivesOnly:
    CmdArgs.push_back("-debug-info-kind=line-directives-only");
    break;
  case llvm::codegenoptions::DebugLineTablesOnly:
    CmdArgs.push_back("-debug-info-kind=line-tables-only");
    break;
  case llvm::codegenoptions::DebugInfoConstructor:
    CmdArgs.push_back("-debug-info-kind=constructor");
    break;
  case llvm::codegenoptions::LimitedDebugInfo:
    CmdArgs.push_back("-debug-info-kind=limited");
    break;
  case llvm::codegenoptions::FullDebugInfo:
    CmdArgs.push_back("-debug-info-kind=standalone");
    break;
  case llvm::codegenoptions::UnusedTypeInfo:
    CmdArgs.push_back("-debug-info-kind=unused-types");
    break;
  default:
    break;
  }
}

// Convert an arg of the form "-gN" or "-ggdbN" or one of their aliases
// to the corresponding DebugInfoKind.
llvm::codegenoptions::DebugInfoKind tools::debugLevelToInfoKind(const Arg &A) {
  assert(A.getOption().matches(options::OPT_gN_Group) &&
         "Not a -g option that specifies a debug-info level");
  if (A.getOption().matches(options::OPT_g0) ||
      A.getOption().matches(options::OPT_ggdb0))
    return llvm::codegenoptions::NoDebugInfo;
  if (A.getOption().matches(options::OPT_gline_tables_only) ||
      A.getOption().matches(options::OPT_ggdb1))
    return llvm::codegenoptions::DebugLineTablesOnly;
  if (A.getOption().matches(options::OPT_gline_directives_only))
    return llvm::codegenoptions::DebugDirectivesOnly;
  return llvm::codegenoptions::DebugInfoConstructor;
}

static unsigned ParseDebugDefaultVersion(const ToolChain &TC,
                                         const ArgList &Args) {
  const Arg *A = Args.getLastArg(options::OPT_fdebug_default_version);

  if (!A)
    return 0;

  unsigned Value = 0;
  if (StringRef(A->getValue()).getAsInteger(10, Value) || Value > 5 ||
      Value < 2)
    TC.getDriver().Diag(diag::err_drv_invalid_int_value)
        << A->getAsString(Args) << A->getValue();
  return Value;
}

unsigned tools::DwarfVersionNum(StringRef ArgValue) {
  return llvm::StringSwitch<unsigned>(ArgValue)
      .Case("-gdwarf-2", 2)
      .Case("-gdwarf-3", 3)
      .Case("-gdwarf-4", 4)
      .Case("-gdwarf-5", 5)
      .Default(0);
}

const Arg *tools::getDwarfNArg(const ArgList &Args) {
  return Args.getLastArg(options::OPT_gdwarf_2, options::OPT_gdwarf_3,
                         options::OPT_gdwarf_4, options::OPT_gdwarf_5,
                         options::OPT_gdwarf);
}

unsigned tools::getDwarfVersion(const ToolChain &TC,
                                const llvm::opt::ArgList &Args) {
  unsigned DwarfVersion = ParseDebugDefaultVersion(TC, Args);
  if (const Arg *GDwarfN = getDwarfNArg(Args))
    if (int N = DwarfVersionNum(GDwarfN->getSpelling()))
      DwarfVersion = N;
  if (DwarfVersion == 0) {
    DwarfVersion = TC.GetDefaultDwarfVersion();
    assert(DwarfVersion && "toolchain default DWARF version must be nonzero");
  }
  return DwarfVersion;
}

void tools::AddAssemblerKPIC(const ToolChain &ToolChain, const ArgList &Args,
                             ArgStringList &CmdArgs) {
  llvm::Reloc::Model
RelocationModel; unsigned PICLevel; bool IsPIE; std::tie(RelocationModel, PICLevel, IsPIE) = ParsePICArgs(ToolChain, Args); if (RelocationModel != llvm::Reloc::Static) CmdArgs.push_back("-KPIC"); } /// Determine whether Objective-C automated reference counting is /// enabled. bool tools::isObjCAutoRefCount(const ArgList &Args) { return Args.hasFlag(options::OPT_fobjc_arc, options::OPT_fno_objc_arc, false); } enum class LibGccType { UnspecifiedLibGcc, StaticLibGcc, SharedLibGcc }; static LibGccType getLibGccType(const ToolChain &TC, const Driver &D, const ArgList &Args) { if (Args.hasArg(options::OPT_static_libgcc) || Args.hasArg(options::OPT_static) || Args.hasArg(options::OPT_static_pie) || // The Android NDK only provides libunwind.a, not libunwind.so. TC.getTriple().isAndroid()) return LibGccType::StaticLibGcc; if (Args.hasArg(options::OPT_shared_libgcc)) return LibGccType::SharedLibGcc; return LibGccType::UnspecifiedLibGcc; } // Gcc adds libgcc arguments in various ways: // // gcc : -lgcc --as-needed -lgcc_s --no-as-needed // g++ : -lgcc_s -lgcc // gcc shared: -lgcc_s -lgcc // g++ shared: -lgcc_s -lgcc // gcc static: -lgcc -lgcc_eh // g++ static: -lgcc -lgcc_eh // gcc static-pie: -lgcc -lgcc_eh // g++ static-pie: -lgcc -lgcc_eh // // Also, certain targets need additional adjustments. static void AddUnwindLibrary(const ToolChain &TC, const Driver &D, ArgStringList &CmdArgs, const ArgList &Args) { ToolChain::UnwindLibType UNW = TC.GetUnwindLibType(Args); // By default OHOS binaries are linked statically to libunwind. if (TC.getTriple().isOHOSFamily() && UNW == ToolChain::UNW_CompilerRT) { CmdArgs.push_back("-l:libunwind.a"); return; } // Targets that don't use unwind libraries. if ((TC.getTriple().isAndroid() && UNW == ToolChain::UNW_Libgcc) || TC.getTriple().isOSIAMCU() || TC.getTriple().isOSBinFormatWasm() || TC.getTriple().isWindowsMSVCEnvironment() || UNW == ToolChain::UNW_None) return; LibGccType LGT = getLibGccType(TC, D, Args); bool AsNeeded = LGT == LibGccType::UnspecifiedLibGcc && (UNW == ToolChain::UNW_CompilerRT || !D.CCCIsCXX()) && !TC.getTriple().isAndroid() && !TC.getTriple().isOSCygMing() && !TC.getTriple().isOSAIX(); if (AsNeeded) CmdArgs.push_back(getAsNeededOption(TC, true)); switch (UNW) { case ToolChain::UNW_None: return; case ToolChain::UNW_Libgcc: { if (LGT == LibGccType::StaticLibGcc) CmdArgs.push_back("-lgcc_eh"); else CmdArgs.push_back("-lgcc_s"); break; } case ToolChain::UNW_CompilerRT: if (TC.getTriple().isOSAIX()) { // AIX only has libunwind as a shared library. So do not pass // anything in if -static is specified. 
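    // Illustrative outcomes for the compiler-rt unwinder below: plain
    // "-lunwind" on AIX (and when the linker is left to choose between the
    // static and shared library), "-l:libunwind.a" for static libgcc links,
    // and "-l:libunwind.dll.a" or "-l:libunwind.so" for shared libgcc on
    // MinGW/Cygwin vs. other targets.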
if (LGT != LibGccType::StaticLibGcc) CmdArgs.push_back("-lunwind"); } else if (LGT == LibGccType::StaticLibGcc) { CmdArgs.push_back("-l:libunwind.a"); } else if (LGT == LibGccType::SharedLibGcc) { if (TC.getTriple().isOSCygMing()) CmdArgs.push_back("-l:libunwind.dll.a"); else CmdArgs.push_back("-l:libunwind.so"); } else { // Let the linker choose between libunwind.so and libunwind.a // depending on what's available, and depending on the -static flag CmdArgs.push_back("-lunwind"); } break; } if (AsNeeded) CmdArgs.push_back(getAsNeededOption(TC, false)); } static void AddLibgcc(const ToolChain &TC, const Driver &D, ArgStringList &CmdArgs, const ArgList &Args) { LibGccType LGT = getLibGccType(TC, D, Args); if (LGT == LibGccType::StaticLibGcc || (LGT == LibGccType::UnspecifiedLibGcc && !D.CCCIsCXX())) CmdArgs.push_back("-lgcc"); AddUnwindLibrary(TC, D, CmdArgs, Args); if (LGT == LibGccType::SharedLibGcc || (LGT == LibGccType::UnspecifiedLibGcc && D.CCCIsCXX())) CmdArgs.push_back("-lgcc"); } void tools::AddRunTimeLibs(const ToolChain &TC, const Driver &D, ArgStringList &CmdArgs, const ArgList &Args) { // Make use of compiler-rt if --rtlib option is used ToolChain::RuntimeLibType RLT = TC.GetRuntimeLibType(Args); switch (RLT) { case ToolChain::RLT_CompilerRT: CmdArgs.push_back(TC.getCompilerRTArgString(Args, "builtins")); AddUnwindLibrary(TC, D, CmdArgs, Args); break; case ToolChain::RLT_Libgcc: // Make sure libgcc is not used under MSVC environment by default if (TC.getTriple().isKnownWindowsMSVCEnvironment()) { // Issue error diagnostic if libgcc is explicitly specified // through command line as --rtlib option argument. Arg *A = Args.getLastArg(options::OPT_rtlib_EQ); if (A && A->getValue() != StringRef("platform")) { TC.getDriver().Diag(diag::err_drv_unsupported_rtlib_for_platform) << A->getValue() << "MSVC"; } } else AddLibgcc(TC, D, CmdArgs, Args); break; } // On Android, the unwinder uses dl_iterate_phdr (or one of // dl_unwind_find_exidx/__gnu_Unwind_Find_exidx on arm32) from libdl.so. For // statically-linked executables, these functions come from libc.a instead. if (TC.getTriple().isAndroid() && !Args.hasArg(options::OPT_static) && !Args.hasArg(options::OPT_static_pie)) CmdArgs.push_back("-ldl"); } SmallString<128> tools::getStatsFileName(const llvm::opt::ArgList &Args, const InputInfo &Output, const InputInfo &Input, const Driver &D) { const Arg *A = Args.getLastArg(options::OPT_save_stats_EQ); if (!A && !D.CCPrintInternalStats) return {}; SmallString<128> StatsFile; if (A) { StringRef SaveStats = A->getValue(); if (SaveStats == "obj" && Output.isFilename()) { StatsFile.assign(Output.getFilename()); llvm::sys::path::remove_filename(StatsFile); } else if (SaveStats != "cwd") { D.Diag(diag::err_drv_invalid_value) << A->getAsString(Args) << SaveStats; return {}; } StringRef BaseName = llvm::sys::path::filename(Input.getBaseInput()); llvm::sys::path::append(StatsFile, BaseName); llvm::sys::path::replace_extension(StatsFile, "stats"); } else { assert(D.CCPrintInternalStats); StatsFile.assign(D.CCPrintInternalStatReportFilename.empty() ? "-" : D.CCPrintInternalStatReportFilename); } return StatsFile; } void tools::addMultilibFlag(bool Enabled, const StringRef Flag, Multilib::flags_list &Flags) { assert(Flag.front() == '-'); if (Enabled) { Flags.push_back(Flag.str()); } else { Flags.push_back(("!" 
+ Flag.substr(1)).str()); } } void tools::addX86AlignBranchArgs(const Driver &D, const ArgList &Args, ArgStringList &CmdArgs, bool IsLTO, const StringRef PluginOptPrefix) { auto addArg = [&, IsLTO](const Twine &Arg) { if (IsLTO) { assert(!PluginOptPrefix.empty() && "Cannot have empty PluginOptPrefix!"); CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + Arg)); } else { CmdArgs.push_back("-mllvm"); CmdArgs.push_back(Args.MakeArgString(Arg)); } }; if (Args.hasArg(options::OPT_mbranches_within_32B_boundaries)) { addArg(Twine("-x86-branches-within-32B-boundaries")); } if (const Arg *A = Args.getLastArg(options::OPT_malign_branch_boundary_EQ)) { StringRef Value = A->getValue(); unsigned Boundary; if (Value.getAsInteger(10, Boundary) || Boundary < 16 || !llvm::isPowerOf2_64(Boundary)) { D.Diag(diag::err_drv_invalid_argument_to_option) << Value << A->getOption().getName(); } else { addArg("-x86-align-branch-boundary=" + Twine(Boundary)); } } if (const Arg *A = Args.getLastArg(options::OPT_malign_branch_EQ)) { std::string AlignBranch; for (StringRef T : A->getValues()) { if (T != "fused" && T != "jcc" && T != "jmp" && T != "call" && T != "ret" && T != "indirect") D.Diag(diag::err_drv_invalid_malign_branch_EQ) << T << "fused, jcc, jmp, call, ret, indirect"; if (!AlignBranch.empty()) AlignBranch += '+'; AlignBranch += T; } addArg("-x86-align-branch=" + Twine(AlignBranch)); } if (const Arg *A = Args.getLastArg(options::OPT_mpad_max_prefix_size_EQ)) { StringRef Value = A->getValue(); unsigned PrefixSize; if (Value.getAsInteger(10, PrefixSize)) { D.Diag(diag::err_drv_invalid_argument_to_option) << Value << A->getOption().getName(); } else { addArg("-x86-pad-max-prefix-size=" + Twine(PrefixSize)); } } } /// SDLSearch: Search for Static Device Library /// The search for SDL bitcode files is consistent with how static host /// libraries are discovered. That is, the -l option triggers a search for /// files in a set of directories called the LINKPATH. The host library search /// procedure looks for a specific filename in the LINKPATH. The filename for /// a host library is lib.a or lib.so. For SDLs, there is an /// ordered-set of filenames that are searched. We call this ordered-set of /// filenames as SEARCH-ORDER. Since an SDL can either be device-type specific, /// architecture specific, or generic across all architectures, a naming /// convention and search order is used where the file name embeds the /// architecture name (nvptx or amdgcn) and the GPU device type /// such as sm_30 and gfx906. is absent in case of /// device-independent SDLs. To reduce congestion in host library directories, /// the search first looks for files in the “libdevice” subdirectory. SDLs that /// are bc files begin with the prefix “lib”. /// /// Machine-code SDLs can also be managed as an archive (*.a file). The /// convention has been to use the prefix “lib”. To avoid confusion with host /// archive libraries, we use prefix "libbc-" for the bitcode SDL archives. 
/// bool tools::SDLSearch(const Driver &D, const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, SmallVector LibraryPaths, std::string Lib, StringRef Arch, StringRef Target, bool isBitCodeSDL, bool postClangLink) { SmallVector SDLs; std::string LibDeviceLoc = "/libdevice"; std::string LibBcPrefix = "/libbc-"; std::string LibPrefix = "/lib"; if (isBitCodeSDL) { // SEARCH-ORDER for Bitcode SDLs: // libdevice/libbc---.a // libbc---.a // libdevice/libbc--.a // libbc--.a // libdevice/libbc-.a // libbc-.a // libdevice/lib--.bc // lib--.bc // libdevice/lib-.bc // lib-.bc // libdevice/lib.bc // lib.bc for (StringRef Base : {LibBcPrefix, LibPrefix}) { const auto *Ext = Base.contains(LibBcPrefix) ? ".a" : ".bc"; for (auto Suffix : {Twine(Lib + "-" + Arch + "-" + Target).str(), Twine(Lib + "-" + Arch).str(), Twine(Lib).str()}) { SDLs.push_back(Twine(LibDeviceLoc + Base + Suffix + Ext).str()); SDLs.push_back(Twine(Base + Suffix + Ext).str()); } } } else { // SEARCH-ORDER for Machine-code SDLs: // libdevice/lib--.a // lib--.a // libdevice/lib-.a // lib-.a const auto *Ext = ".a"; for (auto Suffix : {Twine(Lib + "-" + Arch + "-" + Target).str(), Twine(Lib + "-" + Arch).str()}) { SDLs.push_back(Twine(LibDeviceLoc + LibPrefix + Suffix + Ext).str()); SDLs.push_back(Twine(LibPrefix + Suffix + Ext).str()); } } // The CUDA toolchain does not use a global device llvm-link before the LLVM // backend generates ptx. So currently, the use of bitcode SDL for nvptx is // only possible with post-clang-cc1 linking. Clang cc1 has a feature that // will link libraries after clang compilation while the LLVM IR is still in // memory. This utilizes a clang cc1 option called “-mlink-builtin-bitcode”. // This is a clang -cc1 option that is generated by the clang driver. The // option value must a full path to an existing file. bool FoundSDL = false; for (auto LPath : LibraryPaths) { for (auto SDL : SDLs) { auto FullName = Twine(LPath + SDL).str(); if (llvm::sys::fs::exists(FullName)) { if (postClangLink) CC1Args.push_back("-mlink-builtin-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(FullName)); FoundSDL = true; break; } } if (FoundSDL) break; } return FoundSDL; } /// Search if a user provided archive file lib.a exists in any of /// the library paths. If so, add a new command to clang-offload-bundler to /// unbundle this archive and create a temporary device specific archive. Name /// of this SDL is passed to the llvm-link tool. bool tools::GetSDLFromOffloadArchive( Compilation &C, const Driver &D, const Tool &T, const JobAction &JA, const InputInfoList &Inputs, const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, SmallVector LibraryPaths, StringRef Lib, StringRef Arch, StringRef Target, bool isBitCodeSDL, bool postClangLink) { // We don't support bitcode archive bundles for nvptx if (isBitCodeSDL && Arch.contains("nvptx")) return false; bool FoundAOB = false; std::string ArchiveOfBundles; llvm::Triple Triple(D.getTargetTriple()); bool IsMSVC = Triple.isWindowsMSVCEnvironment(); auto Ext = IsMSVC ? ".lib" : ".a"; if (!Lib.startswith(":") && !Lib.startswith("-l")) { if (llvm::sys::fs::exists(Lib)) { ArchiveOfBundles = Lib; FoundAOB = true; } } else { if (Lib.startswith("-l")) Lib = Lib.drop_front(2); for (auto LPath : LibraryPaths) { ArchiveOfBundles.clear(); SmallVector AOBFileNames; auto LibFile = (Lib.startswith(":") ? Lib.drop_front() : IsMSVC ? 
Lib + Ext : "lib" + Lib + Ext) .str(); for (auto Prefix : {"/libdevice/", "/"}) { auto AOB = Twine(LPath + Prefix + LibFile).str(); if (llvm::sys::fs::exists(AOB)) { ArchiveOfBundles = AOB; FoundAOB = true; break; } } if (FoundAOB) break; } } if (!FoundAOB) return false; llvm::file_magic Magic; auto EC = llvm::identify_magic(ArchiveOfBundles, Magic); if (EC || Magic != llvm::file_magic::archive) return false; StringRef Prefix = isBitCodeSDL ? "libbc-" : "lib"; std::string OutputLib = D.GetTemporaryPath(Twine(Prefix + llvm::sys::path::filename(Lib) + "-" + Arch + "-" + Target) .str(), "a"); C.addTempFile(C.getArgs().MakeArgString(OutputLib)); ArgStringList CmdArgs; SmallString<128> DeviceTriple; DeviceTriple += Action::GetOffloadKindName(JA.getOffloadingDeviceKind()); DeviceTriple += '-'; std::string NormalizedTriple = T.getToolChain().getTriple().normalize(); DeviceTriple += NormalizedTriple; if (!Target.empty()) { DeviceTriple += '-'; DeviceTriple += Target; } std::string UnbundleArg("-unbundle"); std::string TypeArg("-type=a"); std::string InputArg("-input=" + ArchiveOfBundles); std::string OffloadArg("-targets=" + std::string(DeviceTriple)); std::string OutputArg("-output=" + OutputLib); const char *UBProgram = DriverArgs.MakeArgString( T.getToolChain().GetProgramPath("clang-offload-bundler")); ArgStringList UBArgs; UBArgs.push_back(C.getArgs().MakeArgString(UnbundleArg)); UBArgs.push_back(C.getArgs().MakeArgString(TypeArg)); UBArgs.push_back(C.getArgs().MakeArgString(InputArg)); UBArgs.push_back(C.getArgs().MakeArgString(OffloadArg)); UBArgs.push_back(C.getArgs().MakeArgString(OutputArg)); // Add this flag to not exit from clang-offload-bundler if no compatible // code object is found in heterogenous archive library. std::string AdditionalArgs("-allow-missing-bundles"); UBArgs.push_back(C.getArgs().MakeArgString(AdditionalArgs)); // Add this flag to treat hip and hipv4 offload kinds as compatible with // openmp offload kind while extracting code objects from a heterogenous // archive library. Vice versa is also considered compatible. std::string HipCompatibleArgs("-hip-openmp-compatible"); UBArgs.push_back(C.getArgs().MakeArgString(HipCompatibleArgs)); C.addCommand(std::make_unique( JA, T, ResponseFileSupport::AtFileCurCP(), UBProgram, UBArgs, Inputs, InputInfo(&JA, C.getArgs().MakeArgString(OutputLib)))); if (postClangLink) CC1Args.push_back("-mlink-builtin-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(OutputLib)); return true; } // Wrapper function used by driver for adding SDLs during link phase. void tools::AddStaticDeviceLibsLinking(Compilation &C, const Tool &T, const JobAction &JA, const InputInfoList &Inputs, const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, StringRef Arch, StringRef Target, bool isBitCodeSDL, bool postClangLink) { AddStaticDeviceLibs(&C, &T, &JA, &Inputs, C.getDriver(), DriverArgs, CC1Args, Arch, Target, isBitCodeSDL, postClangLink); } // User defined Static Device Libraries(SDLs) can be passed to clang for // offloading GPU compilers. Like static host libraries, the use of a SDL is // specified with the -l command line option. The primary difference between // host and SDLs is the filenames for SDLs (refer SEARCH-ORDER for Bitcode SDLs // and SEARCH-ORDER for Machine-code SDLs for the naming convention). // SDLs are of following types: // // * Bitcode SDLs: They can either be a *.bc file or an archive of *.bc files. // For NVPTX, these libraries are post-clang linked following each // compilation. 
For AMDGPU, these libraries are linked one time // during the application link phase. // // * Machine-code SDLs: They are archive files. For AMDGPU, the process for // machine code SDLs is still in development. But they will be linked // by the LLVM tool lld. // // * Bundled objects that contain both host and device codes: Bundled objects // may also contain library code compiled from source. For NVPTX, the // bundle contains cubin. For AMDGPU, the bundle contains bitcode. // // For Bitcode and Machine-code SDLs, current compiler toolchains hardcode the // inclusion of specific SDLs such as math libraries and the OpenMP device // library libomptarget. void tools::AddStaticDeviceLibs(Compilation *C, const Tool *T, const JobAction *JA, const InputInfoList *Inputs, const Driver &D, const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, StringRef Arch, StringRef Target, bool isBitCodeSDL, bool postClangLink) { SmallVector LibraryPaths; // Add search directories from LIBRARY_PATH env variable std::optional LibPath = llvm::sys::Process::GetEnv("LIBRARY_PATH"); if (LibPath) { SmallVector Frags; const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'}; llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr); for (StringRef Path : Frags) LibraryPaths.emplace_back(Path.trim()); } // Add directories from user-specified -L options for (std::string Search_Dir : DriverArgs.getAllArgValues(options::OPT_L)) LibraryPaths.emplace_back(Search_Dir); // Add path to lib-debug folders SmallString<256> DefaultLibPath = llvm::sys::path::parent_path(D.Dir); llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME); LibraryPaths.emplace_back(DefaultLibPath.c_str()); // Build list of Static Device Libraries SDLs specified by -l option llvm::SmallSet SDLNames; static const StringRef HostOnlyArchives[] = { "omp", "cudart", "m", "gcc", "gcc_s", "pthread", "hip_hcc"}; for (auto SDLName : DriverArgs.getAllArgValues(options::OPT_l)) { if (!HostOnlyArchives->contains(SDLName)) { SDLNames.insert(std::string("-l") + SDLName); } } for (auto Input : DriverArgs.getAllArgValues(options::OPT_INPUT)) { auto FileName = StringRef(Input); // Clang treats any unknown file types as archives and passes them to the // linker. Files with extension 'lib' are classified as TY_Object by clang // but they are usually archives. It is OK if the file is not really an // archive since GetSDLFromOffloadArchive will check the magic of the file // and only unbundle it if it is really an archive. const StringRef LibFileExt = ".lib"; if (!llvm::sys::path::has_extension(FileName) || types::lookupTypeForExtension( llvm::sys::path::extension(FileName).drop_front()) == types::TY_INVALID || llvm::sys::path::extension(FileName) == LibFileExt) SDLNames.insert(Input); } // The search stops as soon as an SDL file is found. The driver then provides // the full filename of the SDL to the llvm-link command. If no SDL is found // after searching each LINKPATH with SEARCH-ORDER, it is possible that an // archive file lib.a exists and may contain bundled object files. 
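As an aside, the SEARCH-ORDER listed earlier in SDLSearch can be illustrated with a small standalone sketch (plain C++, not the LLVM Twine/StringRef APIs used in this file); Lib, Arch and Target are hypothetical values of the kind the driver derives from -l, -march and the offload triple. It prints the bitcode-SDL candidate names in the order the driver probes them under each library path.

#include <iostream>
#include <string>
#include <vector>

int main() {
  // Hypothetical inputs; the driver gets these from -l, -march and the triple.
  std::string Lib = "mylib", Arch = "gfx906", Target = "amdgcn-amd-amdhsa";

  std::vector<std::string> Candidates;
  // Mirrors the bitcode search order: libbc-* archives first, then lib*.bc,
  // each probed in <path>/libdevice/ before <path>/ itself.
  for (std::string Base : {std::string("/libbc-"), std::string("/lib")}) {
    std::string Ext = (Base == "/libbc-") ? ".a" : ".bc";
    for (std::string Suffix :
         {Lib + "-" + Arch + "-" + Target, Lib + "-" + Arch, Lib}) {
      Candidates.push_back("/libdevice" + Base + Suffix + Ext);
      Candidates.push_back(Base + Suffix + Ext);
    }
  }
  for (const std::string &C : Candidates)
    std::cout << C << "\n"; // e.g. /libdevice/libbc-mylib-gfx906-amdgcn-amd-amdhsa.a
}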
for (auto SDLName : SDLNames) { // This is the only call to SDLSearch if (!SDLSearch(D, DriverArgs, CC1Args, LibraryPaths, SDLName, Arch, Target, isBitCodeSDL, postClangLink)) { GetSDLFromOffloadArchive(*C, D, *T, *JA, *Inputs, DriverArgs, CC1Args, LibraryPaths, SDLName, Arch, Target, isBitCodeSDL, postClangLink); } } } static llvm::opt::Arg * getAMDGPUCodeObjectArgument(const Driver &D, const llvm::opt::ArgList &Args) { return Args.getLastArg(options::OPT_mcode_object_version_EQ); } void tools::checkAMDGPUCodeObjectVersion(const Driver &D, const llvm::opt::ArgList &Args) { const unsigned MinCodeObjVer = 2; const unsigned MaxCodeObjVer = 5; if (auto *CodeObjArg = getAMDGPUCodeObjectArgument(D, Args)) { if (CodeObjArg->getOption().getID() == options::OPT_mcode_object_version_EQ) { unsigned CodeObjVer = MaxCodeObjVer; auto Remnant = StringRef(CodeObjArg->getValue()).getAsInteger(0, CodeObjVer); if (Remnant || CodeObjVer < MinCodeObjVer || CodeObjVer > MaxCodeObjVer) D.Diag(diag::err_drv_invalid_int_value) << CodeObjArg->getAsString(Args) << CodeObjArg->getValue(); } } } unsigned tools::getAMDGPUCodeObjectVersion(const Driver &D, const llvm::opt::ArgList &Args) { unsigned CodeObjVer = 4; // default if (auto *CodeObjArg = getAMDGPUCodeObjectArgument(D, Args)) StringRef(CodeObjArg->getValue()).getAsInteger(0, CodeObjVer); return CodeObjVer; } bool tools::haveAMDGPUCodeObjectVersionArgument( const Driver &D, const llvm::opt::ArgList &Args) { return getAMDGPUCodeObjectArgument(D, Args) != nullptr; } void tools::addMachineOutlinerArgs(const Driver &D, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs, const llvm::Triple &Triple, bool IsLTO, const StringRef PluginOptPrefix) { auto addArg = [&, IsLTO](const Twine &Arg) { if (IsLTO) { assert(!PluginOptPrefix.empty() && "Cannot have empty PluginOptPrefix!"); CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + Arg)); } else { CmdArgs.push_back("-mllvm"); CmdArgs.push_back(Args.MakeArgString(Arg)); } }; if (Arg *A = Args.getLastArg(options::OPT_moutline, options::OPT_mno_outline)) { if (A->getOption().matches(options::OPT_moutline)) { // We only support -moutline in AArch64 and ARM targets right now. If // we're not compiling for these, emit a warning and ignore the flag. // Otherwise, add the proper mllvm flags. if (!(Triple.isARM() || Triple.isThumb() || Triple.getArch() == llvm::Triple::aarch64 || Triple.getArch() == llvm::Triple::aarch64_32)) { D.Diag(diag::warn_drv_moutline_unsupported_opt) << Triple.getArchName(); } else { addArg(Twine("-enable-machine-outliner")); } } else { // Disable all outlining behaviour. addArg(Twine("-enable-machine-outliner=never")); } } } void tools::addOpenMPDeviceRTL(const Driver &D, const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, StringRef BitcodeSuffix, const llvm::Triple &Triple) { SmallVector LibraryPaths; // Add path to clang lib / lib64 folder. SmallString<256> DefaultLibPath = llvm::sys::path::parent_path(D.Dir); llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME); LibraryPaths.emplace_back(DefaultLibPath.c_str()); // Add user defined library paths from LIBRARY_PATH. std::optional LibPath = llvm::sys::Process::GetEnv("LIBRARY_PATH"); if (LibPath) { SmallVector Frags; const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'}; llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr); for (StringRef Path : Frags) LibraryPaths.emplace_back(Path.trim()); } OptSpecifier LibomptargetBCPathOpt = Triple.isAMDGCN() ? 
options::OPT_libomptarget_amdgpu_bc_path_EQ : options::OPT_libomptarget_nvptx_bc_path_EQ; StringRef ArchPrefix = Triple.isAMDGCN() ? "amdgpu" : "nvptx"; std::string LibOmpTargetName = ("libomptarget-" + ArchPrefix + "-" + BitcodeSuffix + ".bc").str(); // First check whether user specifies bc library if (const Arg *A = DriverArgs.getLastArg(LibomptargetBCPathOpt)) { SmallString<128> LibOmpTargetFile(A->getValue()); if (llvm::sys::fs::exists(LibOmpTargetFile) && llvm::sys::fs::is_directory(LibOmpTargetFile)) { llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName); } if (llvm::sys::fs::exists(LibOmpTargetFile)) { CC1Args.push_back("-mlink-builtin-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile)); } else { D.Diag(diag::err_drv_omp_offload_target_bcruntime_not_found) << LibOmpTargetFile; } } else { bool FoundBCLibrary = false; for (StringRef LibraryPath : LibraryPaths) { SmallString<128> LibOmpTargetFile(LibraryPath); llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName); if (llvm::sys::fs::exists(LibOmpTargetFile)) { CC1Args.push_back("-mlink-builtin-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile)); FoundBCLibrary = true; break; } } if (!FoundBCLibrary) D.Diag(diag::err_drv_omp_offload_target_missingbcruntime) << LibOmpTargetName << ArchPrefix; } } void tools::addHIPRuntimeLibArgs(const ToolChain &TC, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) { if (Args.hasArg(options::OPT_hip_link) && !Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_no_hip_rt)) { TC.AddHIPRuntimeLibArgs(Args, CmdArgs); } else { // Claim "no HIP libraries" arguments if any for (auto *Arg : Args.filtered(options::OPT_no_hip_rt)) { Arg->claim(); } } } diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index c6f958a6077b..0bd4b01ff79d 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -1,1231 +1,1236 @@ //===--- FrontendAction.cpp -----------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "clang/Frontend/FrontendAction.h" #include "clang/AST/ASTConsumer.h" #include "clang/AST/ASTContext.h" #include "clang/AST/DeclGroup.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/DiagnosticOptions.h" #include "clang/Basic/FileEntry.h" #include "clang/Basic/LangStandard.h" #include "clang/Basic/Sarif.h" +#include "clang/Basic/Stack.h" #include "clang/Frontend/ASTUnit.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "clang/Frontend/FrontendPluginRegistry.h" #include "clang/Frontend/LayoutOverrideSource.h" #include "clang/Frontend/MultiplexConsumer.h" #include "clang/Frontend/SARIFDiagnosticPrinter.h" #include "clang/Frontend/Utils.h" #include "clang/Lex/HeaderSearch.h" #include "clang/Lex/LiteralSupport.h" #include "clang/Lex/Preprocessor.h" #include "clang/Lex/PreprocessorOptions.h" #include "clang/Parse/ParseAST.h" #include "clang/Sema/HLSLExternalSemaSource.h" #include "clang/Sema/MultiplexExternalSemaSource.h" #include "clang/Serialization/ASTDeserializationListener.h" #include "clang/Serialization/ASTReader.h" #include "clang/Serialization/GlobalModuleIndex.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/Support/BuryPointer.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" #include #include using namespace clang; LLVM_INSTANTIATE_REGISTRY(FrontendPluginRegistry) namespace { class DelegatingDeserializationListener : public ASTDeserializationListener { ASTDeserializationListener *Previous; bool DeletePrevious; public: explicit DelegatingDeserializationListener( ASTDeserializationListener *Previous, bool DeletePrevious) : Previous(Previous), DeletePrevious(DeletePrevious) {} ~DelegatingDeserializationListener() override { if (DeletePrevious) delete Previous; } void ReaderInitialized(ASTReader *Reader) override { if (Previous) Previous->ReaderInitialized(Reader); } void IdentifierRead(serialization::IdentID ID, IdentifierInfo *II) override { if (Previous) Previous->IdentifierRead(ID, II); } void TypeRead(serialization::TypeIdx Idx, QualType T) override { if (Previous) Previous->TypeRead(Idx, T); } void DeclRead(serialization::DeclID ID, const Decl *D) override { if (Previous) Previous->DeclRead(ID, D); } void SelectorRead(serialization::SelectorID ID, Selector Sel) override { if (Previous) Previous->SelectorRead(ID, Sel); } void MacroDefinitionRead(serialization::PreprocessedEntityID PPID, MacroDefinitionRecord *MD) override { if (Previous) Previous->MacroDefinitionRead(PPID, MD); } }; /// Dumps deserialized declarations. class DeserializedDeclsDumper : public DelegatingDeserializationListener { public: explicit DeserializedDeclsDumper(ASTDeserializationListener *Previous, bool DeletePrevious) : DelegatingDeserializationListener(Previous, DeletePrevious) {} void DeclRead(serialization::DeclID ID, const Decl *D) override { llvm::outs() << "PCH DECL: " << D->getDeclKindName(); if (const NamedDecl *ND = dyn_cast(D)) { llvm::outs() << " - "; ND->printQualifiedName(llvm::outs()); } llvm::outs() << "\n"; DelegatingDeserializationListener::DeclRead(ID, D); } }; /// Checks deserialized declarations and emits error if a name /// matches one given in command-line using -error-on-deserialized-decl. 
class DeserializedDeclsChecker : public DelegatingDeserializationListener { ASTContext &Ctx; std::set NamesToCheck; public: DeserializedDeclsChecker(ASTContext &Ctx, const std::set &NamesToCheck, ASTDeserializationListener *Previous, bool DeletePrevious) : DelegatingDeserializationListener(Previous, DeletePrevious), Ctx(Ctx), NamesToCheck(NamesToCheck) {} void DeclRead(serialization::DeclID ID, const Decl *D) override { if (const NamedDecl *ND = dyn_cast(D)) if (NamesToCheck.find(ND->getNameAsString()) != NamesToCheck.end()) { unsigned DiagID = Ctx.getDiagnostics().getCustomDiagID(DiagnosticsEngine::Error, "%0 was deserialized"); Ctx.getDiagnostics().Report(Ctx.getFullLoc(D->getLocation()), DiagID) << ND; } DelegatingDeserializationListener::DeclRead(ID, D); } }; } // end anonymous namespace FrontendAction::FrontendAction() : Instance(nullptr) {} FrontendAction::~FrontendAction() {} void FrontendAction::setCurrentInput(const FrontendInputFile &CurrentInput, std::unique_ptr AST) { this->CurrentInput = CurrentInput; CurrentASTUnit = std::move(AST); } Module *FrontendAction::getCurrentModule() const { CompilerInstance &CI = getCompilerInstance(); return CI.getPreprocessor().getHeaderSearchInfo().lookupModule( CI.getLangOpts().CurrentModule, SourceLocation(), /*AllowSearch*/false); } std::unique_ptr FrontendAction::CreateWrappedASTConsumer(CompilerInstance &CI, StringRef InFile) { std::unique_ptr Consumer = CreateASTConsumer(CI, InFile); if (!Consumer) return nullptr; // Validate -add-plugin args. bool FoundAllPlugins = true; for (const std::string &Arg : CI.getFrontendOpts().AddPluginActions) { bool Found = false; for (const FrontendPluginRegistry::entry &Plugin : FrontendPluginRegistry::entries()) { if (Plugin.getName() == Arg) Found = true; } if (!Found) { CI.getDiagnostics().Report(diag::err_fe_invalid_plugin_name) << Arg; FoundAllPlugins = false; } } if (!FoundAllPlugins) return nullptr; // If there are no registered plugins we don't need to wrap the consumer if (FrontendPluginRegistry::begin() == FrontendPluginRegistry::end()) return Consumer; // If this is a code completion run, avoid invoking the plugin consumers if (CI.hasCodeCompletionConsumer()) return Consumer; // Collect the list of plugins that go before the main action (in Consumers) // or after it (in AfterConsumers) std::vector> Consumers; std::vector> AfterConsumers; for (const FrontendPluginRegistry::entry &Plugin : FrontendPluginRegistry::entries()) { std::unique_ptr P = Plugin.instantiate(); PluginASTAction::ActionType ActionType = P->getActionType(); if (ActionType == PluginASTAction::CmdlineAfterMainAction || ActionType == PluginASTAction::CmdlineBeforeMainAction) { // This is O(|plugins| * |add_plugins|), but since both numbers are // way below 50 in practice, that's ok. 
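A rough sketch of the ordering decision made by the check that follows (plain C++; the enum is a hypothetical mirror of PluginASTAction::ActionType, not the real one): a Cmdline* plugin is treated as an Add*MainAction plugin only when it was also named by -add-plugin.

#include <cassert>

// Hypothetical mirror of the relevant plugin action kinds.
enum class ActionType {
  CmdlineBeforeMain, CmdlineAfterMain, AddBeforeMain, AddAfterMain
};

// Effective kind used to place the plugin's consumer, assuming
// NamedByAddPlugin says whether the plugin appeared in -add-plugin.
ActionType effectiveKind(ActionType Declared, bool NamedByAddPlugin) {
  if (NamedByAddPlugin) {
    if (Declared == ActionType::CmdlineBeforeMain)
      return ActionType::AddBeforeMain;
    if (Declared == ActionType::CmdlineAfterMain)
      return ActionType::AddAfterMain;
  }
  return Declared;
}

int main() {
  assert(effectiveKind(ActionType::CmdlineAfterMain, true) ==
         ActionType::AddAfterMain);
  assert(effectiveKind(ActionType::CmdlineAfterMain, false) ==
         ActionType::CmdlineAfterMain);
}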
if (llvm::is_contained(CI.getFrontendOpts().AddPluginActions, Plugin.getName())) { if (ActionType == PluginASTAction::CmdlineBeforeMainAction) ActionType = PluginASTAction::AddBeforeMainAction; else ActionType = PluginASTAction::AddAfterMainAction; } } if ((ActionType == PluginASTAction::AddBeforeMainAction || ActionType == PluginASTAction::AddAfterMainAction) && P->ParseArgs( CI, CI.getFrontendOpts().PluginArgs[std::string(Plugin.getName())])) { std::unique_ptr PluginConsumer = P->CreateASTConsumer(CI, InFile); if (ActionType == PluginASTAction::AddBeforeMainAction) { Consumers.push_back(std::move(PluginConsumer)); } else { AfterConsumers.push_back(std::move(PluginConsumer)); } } } // Add to Consumers the main consumer, then all the plugins that go after it Consumers.push_back(std::move(Consumer)); if (!AfterConsumers.empty()) { // If we have plugins after the main consumer, which may be the codegen // action, they likely will need the ASTContext, so don't clear it in the // codegen action. CI.getCodeGenOpts().ClearASTBeforeBackend = false; for (auto &C : AfterConsumers) Consumers.push_back(std::move(C)); } return std::make_unique(std::move(Consumers)); } /// For preprocessed files, if the first line is the linemarker and specifies /// the original source file name, use that name as the input file name. /// Returns the location of the first token after the line marker directive. /// /// \param CI The compiler instance. /// \param InputFile Populated with the filename from the line marker. /// \param IsModuleMap If \c true, add a line note corresponding to this line /// directive. (We need to do this because the directive will not be /// visited by the preprocessor.) static SourceLocation ReadOriginalFileName(CompilerInstance &CI, std::string &InputFile, bool IsModuleMap = false) { auto &SourceMgr = CI.getSourceManager(); auto MainFileID = SourceMgr.getMainFileID(); auto MainFileBuf = SourceMgr.getBufferOrNone(MainFileID); if (!MainFileBuf) return SourceLocation(); std::unique_ptr RawLexer( new Lexer(MainFileID, *MainFileBuf, SourceMgr, CI.getLangOpts())); // If the first line has the syntax of // // # NUM "FILENAME" // // we use FILENAME as the input file name. 
Token T; if (RawLexer->LexFromRawLexer(T) || T.getKind() != tok::hash) return SourceLocation(); if (RawLexer->LexFromRawLexer(T) || T.isAtStartOfLine() || T.getKind() != tok::numeric_constant) return SourceLocation(); unsigned LineNo; SourceLocation LineNoLoc = T.getLocation(); if (IsModuleMap) { llvm::SmallString<16> Buffer; if (Lexer::getSpelling(LineNoLoc, Buffer, SourceMgr, CI.getLangOpts()) .getAsInteger(10, LineNo)) return SourceLocation(); } RawLexer->LexFromRawLexer(T); if (T.isAtStartOfLine() || T.getKind() != tok::string_literal) return SourceLocation(); StringLiteralParser Literal(T, CI.getPreprocessor()); if (Literal.hadError) return SourceLocation(); RawLexer->LexFromRawLexer(T); if (T.isNot(tok::eof) && !T.isAtStartOfLine()) return SourceLocation(); InputFile = Literal.GetString().str(); if (IsModuleMap) CI.getSourceManager().AddLineNote( LineNoLoc, LineNo, SourceMgr.getLineTableFilenameID(InputFile), false, false, SrcMgr::C_User_ModuleMap); return T.getLocation(); } static SmallVectorImpl & operator+=(SmallVectorImpl &Includes, StringRef RHS) { Includes.append(RHS.begin(), RHS.end()); return Includes; } static void addHeaderInclude(StringRef HeaderName, SmallVectorImpl &Includes, const LangOptions &LangOpts, bool IsExternC) { if (IsExternC && LangOpts.CPlusPlus) Includes += "extern \"C\" {\n"; if (LangOpts.ObjC) Includes += "#import \""; else Includes += "#include \""; Includes += HeaderName; Includes += "\"\n"; if (IsExternC && LangOpts.CPlusPlus) Includes += "}\n"; } /// Collect the set of header includes needed to construct the given /// module and update the TopHeaders file set of the module. /// /// \param Module The module we're collecting includes from. /// /// \param Includes Will be augmented with the set of \#includes or \#imports /// needed to load all of the named headers. static std::error_code collectModuleHeaderIncludes( const LangOptions &LangOpts, FileManager &FileMgr, DiagnosticsEngine &Diag, ModuleMap &ModMap, clang::Module *Module, SmallVectorImpl &Includes) { // Don't collect any headers for unavailable modules. if (!Module->isAvailable()) return std::error_code(); // Resolve all lazy header directives to header files. ModMap.resolveHeaderDirectives(Module, /*File=*/std::nullopt); // If any headers are missing, we can't build this module. In most cases, // diagnostics for this should have already been produced; we only get here // if explicit stat information was provided. // FIXME: If the name resolves to a file with different stat information, // produce a better diagnostic. if (!Module->MissingHeaders.empty()) { auto &MissingHeader = Module->MissingHeaders.front(); Diag.Report(MissingHeader.FileNameLoc, diag::err_module_header_missing) << MissingHeader.IsUmbrella << MissingHeader.FileName; return std::error_code(); } // Add includes for each of these headers. for (auto HK : {Module::HK_Normal, Module::HK_Private}) { for (Module::Header &H : Module->Headers[HK]) { Module->addTopHeader(H.Entry); // Use the path as specified in the module map file. We'll look for this // file relative to the module build directory (the directory containing // the module map file) so this will find the same file that we found // while parsing the module map. addHeaderInclude(H.PathRelativeToRootModuleDirectory, Includes, LangOpts, Module->IsExternC); } } // Note that Module->PrivateHeaders will not be a TopHeader. 
if (std::optional UmbrellaHeader = Module->getUmbrellaHeaderAsWritten()) { Module->addTopHeader(UmbrellaHeader->Entry); if (Module->Parent) // Include the umbrella header for submodules. addHeaderInclude(UmbrellaHeader->PathRelativeToRootModuleDirectory, Includes, LangOpts, Module->IsExternC); } else if (std::optional UmbrellaDir = Module->getUmbrellaDirAsWritten()) { // Add all of the headers we find in this subdirectory. std::error_code EC; SmallString<128> DirNative; llvm::sys::path::native(UmbrellaDir->Entry.getName(), DirNative); llvm::vfs::FileSystem &FS = FileMgr.getVirtualFileSystem(); SmallVector, 8> Headers; for (llvm::vfs::recursive_directory_iterator Dir(FS, DirNative, EC), End; Dir != End && !EC; Dir.increment(EC)) { // Check whether this entry has an extension typically associated with // headers. if (!llvm::StringSwitch(llvm::sys::path::extension(Dir->path())) .Cases(".h", ".H", ".hh", ".hpp", true) .Default(false)) continue; auto Header = FileMgr.getOptionalFileRef(Dir->path()); // FIXME: This shouldn't happen unless there is a file system race. Is // that worth diagnosing? if (!Header) continue; // If this header is marked 'unavailable' in this module, don't include // it. if (ModMap.isHeaderUnavailableInModule(*Header, Module)) continue; // Compute the relative path from the directory to this file. SmallVector Components; auto PathIt = llvm::sys::path::rbegin(Dir->path()); for (int I = 0; I != Dir.level() + 1; ++I, ++PathIt) Components.push_back(*PathIt); SmallString<128> RelativeHeader( UmbrellaDir->PathRelativeToRootModuleDirectory); for (auto It = Components.rbegin(), End = Components.rend(); It != End; ++It) llvm::sys::path::append(RelativeHeader, *It); std::string RelName = RelativeHeader.c_str(); Headers.push_back(std::make_pair(RelName, *Header)); } if (EC) return EC; // Sort header paths and make the header inclusion order deterministic // across different OSs and filesystems. llvm::sort(Headers, llvm::less_first()); for (auto &H : Headers) { // Include this header as part of the umbrella directory. Module->addTopHeader(H.second); addHeaderInclude(H.first, Includes, LangOpts, Module->IsExternC); } } // Recurse into submodules. for (auto *Submodule : Module->submodules()) if (std::error_code Err = collectModuleHeaderIncludes( LangOpts, FileMgr, Diag, ModMap, Submodule, Includes)) return Err; return std::error_code(); } static bool loadModuleMapForModuleBuild(CompilerInstance &CI, bool IsSystem, bool IsPreprocessed, std::string &PresumedModuleMapFile, unsigned &Offset) { auto &SrcMgr = CI.getSourceManager(); HeaderSearch &HS = CI.getPreprocessor().getHeaderSearchInfo(); // Map the current input to a file. FileID ModuleMapID = SrcMgr.getMainFileID(); OptionalFileEntryRef ModuleMap = SrcMgr.getFileEntryRefForID(ModuleMapID); assert(ModuleMap && "MainFileID without FileEntry"); // If the module map is preprocessed, handle the initial line marker; // line directives are not part of the module map syntax in general. Offset = 0; if (IsPreprocessed) { SourceLocation EndOfLineMarker = ReadOriginalFileName(CI, PresumedModuleMapFile, /*IsModuleMap*/ true); if (EndOfLineMarker.isValid()) Offset = CI.getSourceManager().getDecomposedLoc(EndOfLineMarker).second; } // Load the module map file. if (HS.loadModuleMapFile(*ModuleMap, IsSystem, ModuleMapID, &Offset, PresumedModuleMapFile)) return true; if (SrcMgr.getBufferOrFake(ModuleMapID).getBufferSize() == Offset) Offset = 0; // Infer framework module if possible. 
if (HS.getModuleMap().canInferFrameworkModule(ModuleMap->getDir())) { SmallString<128> InferredFrameworkPath = ModuleMap->getDir().getName(); llvm::sys::path::append(InferredFrameworkPath, CI.getLangOpts().ModuleName + ".framework"); if (auto Dir = CI.getFileManager().getOptionalDirectoryRef(InferredFrameworkPath)) (void)HS.getModuleMap().inferFrameworkModule(*Dir, IsSystem, nullptr); } return false; } static Module *prepareToBuildModule(CompilerInstance &CI, StringRef ModuleMapFilename) { if (CI.getLangOpts().CurrentModule.empty()) { CI.getDiagnostics().Report(diag::err_missing_module_name); // FIXME: Eventually, we could consider asking whether there was just // a single module described in the module map, and use that as a // default. Then it would be fairly trivial to just "compile" a module // map with a single module (the common case). return nullptr; } // Dig out the module definition. HeaderSearch &HS = CI.getPreprocessor().getHeaderSearchInfo(); Module *M = HS.lookupModule(CI.getLangOpts().CurrentModule, SourceLocation(), /*AllowSearch=*/true); if (!M) { CI.getDiagnostics().Report(diag::err_missing_module) << CI.getLangOpts().CurrentModule << ModuleMapFilename; return nullptr; } // Check whether we can build this module at all. if (Preprocessor::checkModuleIsAvailable(CI.getLangOpts(), CI.getTarget(), CI.getDiagnostics(), M)) return nullptr; // Inform the preprocessor that includes from within the input buffer should // be resolved relative to the build directory of the module map file. CI.getPreprocessor().setMainFileDir(*M->Directory); // If the module was inferred from a different module map (via an expanded // umbrella module definition), track that fact. // FIXME: It would be preferable to fill this in as part of processing // the module map, rather than adding it after the fact. StringRef OriginalModuleMapName = CI.getFrontendOpts().OriginalModuleMap; if (!OriginalModuleMapName.empty()) { auto OriginalModuleMap = CI.getFileManager().getFile(OriginalModuleMapName, /*openFile*/ true); if (!OriginalModuleMap) { CI.getDiagnostics().Report(diag::err_module_map_not_found) << OriginalModuleMapName; return nullptr; } if (*OriginalModuleMap != CI.getSourceManager().getFileEntryForID( CI.getSourceManager().getMainFileID())) { M->IsInferred = true; CI.getPreprocessor().getHeaderSearchInfo().getModuleMap() .setInferredModuleAllowedBy(M, *OriginalModuleMap); } } // If we're being run from the command-line, the module build stack will not // have been filled in yet, so complete it now in order to allow us to detect // module cycles. SourceManager &SourceMgr = CI.getSourceManager(); if (SourceMgr.getModuleBuildStack().empty()) SourceMgr.pushModuleBuildStack(CI.getLangOpts().CurrentModule, FullSourceLoc(SourceLocation(), SourceMgr)); return M; } /// Compute the input buffer that should be used to build the specified module. static std::unique_ptr getInputBufferForModule(CompilerInstance &CI, Module *M) { FileManager &FileMgr = CI.getFileManager(); // Collect the set of #includes we need to build the module. 
SmallString<256> HeaderContents; std::error_code Err = std::error_code(); if (std::optional UmbrellaHeader = M->getUmbrellaHeaderAsWritten()) addHeaderInclude(UmbrellaHeader->PathRelativeToRootModuleDirectory, HeaderContents, CI.getLangOpts(), M->IsExternC); Err = collectModuleHeaderIncludes( CI.getLangOpts(), FileMgr, CI.getDiagnostics(), CI.getPreprocessor().getHeaderSearchInfo().getModuleMap(), M, HeaderContents); if (Err) { CI.getDiagnostics().Report(diag::err_module_cannot_create_includes) << M->getFullModuleName() << Err.message(); return nullptr; } return llvm::MemoryBuffer::getMemBufferCopy( HeaderContents, Module::getModuleInputBufferName()); } bool FrontendAction::BeginSourceFile(CompilerInstance &CI, const FrontendInputFile &RealInput) { FrontendInputFile Input(RealInput); assert(!Instance && "Already processing a source file!"); assert(!Input.isEmpty() && "Unexpected empty filename!"); setCurrentInput(Input); setCompilerInstance(&CI); bool HasBegunSourceFile = false; bool ReplayASTFile = Input.getKind().getFormat() == InputKind::Precompiled && usesPreprocessorOnly(); // If we fail, reset state since the client will not end up calling the // matching EndSourceFile(). All paths that return true should release this. auto FailureCleanup = llvm::make_scope_exit([&]() { if (HasBegunSourceFile) CI.getDiagnosticClient().EndSourceFile(); CI.setASTConsumer(nullptr); CI.clearOutputFiles(/*EraseFiles=*/true); CI.getLangOpts().setCompilingModule(LangOptions::CMK_None); setCurrentInput(FrontendInputFile()); setCompilerInstance(nullptr); }); if (!BeginInvocation(CI)) return false; // If we're replaying the build of an AST file, import it and set up // the initial state from its build. if (ReplayASTFile) { IntrusiveRefCntPtr Diags(&CI.getDiagnostics()); // The AST unit populates its own diagnostics engine rather than ours. IntrusiveRefCntPtr ASTDiags( new DiagnosticsEngine(Diags->getDiagnosticIDs(), &Diags->getDiagnosticOptions())); ASTDiags->setClient(Diags->getClient(), /*OwnsClient*/false); // FIXME: What if the input is a memory buffer? StringRef InputFile = Input.getFile(); std::unique_ptr AST = ASTUnit::LoadFromASTFile( std::string(InputFile), CI.getPCHContainerReader(), ASTUnit::LoadPreprocessorOnly, ASTDiags, CI.getFileSystemOpts(), /*HeaderSearchOptions=*/nullptr, CI.getCodeGenOpts().DebugTypeExtRefs); if (!AST) return false; // Options relating to how we treat the input (but not what we do with it) // are inherited from the AST unit. CI.getHeaderSearchOpts() = AST->getHeaderSearchOpts(); CI.getPreprocessorOpts() = AST->getPreprocessorOpts(); CI.getLangOpts() = AST->getLangOpts(); // Set the shared objects, these are reset when we finish processing the // file, otherwise the CompilerInstance will happily destroy them. CI.setFileManager(&AST->getFileManager()); CI.createSourceManager(CI.getFileManager()); CI.getSourceManager().initializeForReplay(AST->getSourceManager()); // Preload all the module files loaded transitively by the AST unit. Also // load all module map files that were parsed as part of building the AST // unit. if (auto ASTReader = AST->getASTReader()) { auto &MM = ASTReader->getModuleManager(); auto &PrimaryModule = MM.getPrimaryModule(); for (serialization::ModuleFile &MF : MM) if (&MF != &PrimaryModule) CI.getFrontendOpts().ModuleFiles.push_back(MF.FileName); ASTReader->visitTopLevelModuleMaps(PrimaryModule, [&](FileEntryRef FE) { CI.getFrontendOpts().ModuleMapFiles.push_back( std::string(FE.getName())); }); } // Set up the input file for replay purposes. 
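The FailureCleanup object above is built with llvm::make_scope_exit; the pattern (run the rollback on every early return unless release() is called on the success path) can be sketched without the LLVM ADT header, as a minimal stand-in.

#include <iostream>
#include <utility>

// Minimal stand-in for llvm::make_scope_exit: runs Fn at scope exit unless
// release() was called first.
template <typename F> class ScopeExit {
  F Fn;
  bool Armed = true;
public:
  explicit ScopeExit(F Fn) : Fn(std::move(Fn)) {}
  void release() { Armed = false; }
  ~ScopeExit() { if (Armed) Fn(); }
};

bool beginSomething(bool FailEarly) {
  ScopeExit Cleanup([] { std::cout << "rolling back partial state\n"; });
  if (FailEarly)
    return false;    // Cleanup fires here.
  Cleanup.release(); // Success: keep the state that was set up.
  return true;
}

int main() {
  beginSomething(true);
  beginSomething(false);
}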
auto Kind = AST->getInputKind(); if (Kind.getFormat() == InputKind::ModuleMap) { Module *ASTModule = AST->getPreprocessor().getHeaderSearchInfo().lookupModule( AST->getLangOpts().CurrentModule, SourceLocation(), /*AllowSearch*/ false); assert(ASTModule && "module file does not define its own module"); Input = FrontendInputFile(ASTModule->PresumedModuleMapFile, Kind); } else { auto &OldSM = AST->getSourceManager(); FileID ID = OldSM.getMainFileID(); if (auto *File = OldSM.getFileEntryForID(ID)) Input = FrontendInputFile(File->getName(), Kind); else Input = FrontendInputFile(OldSM.getBufferOrFake(ID), Kind); } setCurrentInput(Input, std::move(AST)); } // AST files follow a very different path, since they share objects via the // AST unit. if (Input.getKind().getFormat() == InputKind::Precompiled) { assert(!usesPreprocessorOnly() && "this case was handled above"); assert(hasASTFileSupport() && "This action does not have AST file support!"); IntrusiveRefCntPtr Diags(&CI.getDiagnostics()); // FIXME: What if the input is a memory buffer? StringRef InputFile = Input.getFile(); std::unique_ptr AST = ASTUnit::LoadFromASTFile( std::string(InputFile), CI.getPCHContainerReader(), ASTUnit::LoadEverything, Diags, CI.getFileSystemOpts(), CI.getHeaderSearchOptsPtr(), CI.getCodeGenOpts().DebugTypeExtRefs); if (!AST) return false; // Inform the diagnostic client we are processing a source file. CI.getDiagnosticClient().BeginSourceFile(CI.getLangOpts(), nullptr); HasBegunSourceFile = true; // Set the shared objects, these are reset when we finish processing the // file, otherwise the CompilerInstance will happily destroy them. CI.setFileManager(&AST->getFileManager()); CI.setSourceManager(&AST->getSourceManager()); CI.setPreprocessor(AST->getPreprocessorPtr()); Preprocessor &PP = CI.getPreprocessor(); PP.getBuiltinInfo().initializeBuiltins(PP.getIdentifierTable(), PP.getLangOpts()); CI.setASTContext(&AST->getASTContext()); setCurrentInput(Input, std::move(AST)); // Initialize the action. if (!BeginSourceFileAction(CI)) return false; // Create the AST consumer. CI.setASTConsumer(CreateWrappedASTConsumer(CI, InputFile)); if (!CI.hasASTConsumer()) return false; FailureCleanup.release(); return true; } // Set up the file and source managers, if needed. if (!CI.hasFileManager()) { if (!CI.createFileManager()) { return false; } } if (!CI.hasSourceManager()) { CI.createSourceManager(CI.getFileManager()); if (CI.getDiagnosticOpts().getFormat() == DiagnosticOptions::SARIF) { static_cast(&CI.getDiagnosticClient()) ->setSarifWriter( std::make_unique(CI.getSourceManager())); } } // Set up embedding for any specified files. Do this before we load any // source files, including the primary module map for the compilation. for (const auto &F : CI.getFrontendOpts().ModulesEmbedFiles) { if (auto FE = CI.getFileManager().getFile(F, /*openFile*/true)) CI.getSourceManager().setFileIsTransient(*FE); else CI.getDiagnostics().Report(diag::err_modules_embed_file_not_found) << F; } if (CI.getFrontendOpts().ModulesEmbedAllFiles) CI.getSourceManager().setAllFilesAreTransient(true); // IR files bypass the rest of initialization. if (Input.getKind().getLanguage() == Language::LLVM_IR) { assert(hasIRSupport() && "This action does not have IR file support!"); // Inform the diagnostic client we are processing a source file. CI.getDiagnosticClient().BeginSourceFile(CI.getLangOpts(), nullptr); HasBegunSourceFile = true; // Initialize the action. if (!BeginSourceFileAction(CI)) return false; // Initialize the main file entry. 
if (!CI.InitializeSourceManager(CurrentInput)) return false; FailureCleanup.release(); return true; } // If the implicit PCH include is actually a directory, rather than // a single file, search for a suitable PCH file in that directory. if (!CI.getPreprocessorOpts().ImplicitPCHInclude.empty()) { FileManager &FileMgr = CI.getFileManager(); PreprocessorOptions &PPOpts = CI.getPreprocessorOpts(); StringRef PCHInclude = PPOpts.ImplicitPCHInclude; std::string SpecificModuleCachePath = CI.getSpecificModuleCachePath(); if (auto PCHDir = FileMgr.getOptionalDirectoryRef(PCHInclude)) { std::error_code EC; SmallString<128> DirNative; llvm::sys::path::native(PCHDir->getName(), DirNative); bool Found = false; llvm::vfs::FileSystem &FS = FileMgr.getVirtualFileSystem(); for (llvm::vfs::directory_iterator Dir = FS.dir_begin(DirNative, EC), DirEnd; Dir != DirEnd && !EC; Dir.increment(EC)) { // Check whether this is an acceptable AST file. if (ASTReader::isAcceptableASTFile( Dir->path(), FileMgr, CI.getModuleCache(), CI.getPCHContainerReader(), CI.getLangOpts(), CI.getTargetOpts(), CI.getPreprocessorOpts(), SpecificModuleCachePath, /*RequireStrictOptionMatches=*/true)) { PPOpts.ImplicitPCHInclude = std::string(Dir->path()); Found = true; break; } } if (!Found) { CI.getDiagnostics().Report(diag::err_fe_no_pch_in_dir) << PCHInclude; return false; } } } // Set up the preprocessor if needed. When parsing model files the // preprocessor of the original source is reused. if (!isModelParsingAction()) CI.createPreprocessor(getTranslationUnitKind()); // Inform the diagnostic client we are processing a source file. CI.getDiagnosticClient().BeginSourceFile(CI.getLangOpts(), &CI.getPreprocessor()); HasBegunSourceFile = true; // Handle C++20 header units. // Here, the user has the option to specify that the header name should be // looked up in the pre-processor search paths (and the main filename as // passed by the driver might therefore be incomplete until that look-up). if (CI.getLangOpts().CPlusPlusModules && Input.getKind().isHeaderUnit() && !Input.getKind().isPreprocessed()) { StringRef FileName = Input.getFile(); InputKind Kind = Input.getKind(); if (Kind.getHeaderUnitKind() != InputKind::HeaderUnit_Abs) { assert(CI.hasPreprocessor() && "trying to build a header unit without a Pre-processor?"); HeaderSearch &HS = CI.getPreprocessor().getHeaderSearchInfo(); // Relative searches begin from CWD. auto Dir = CI.getFileManager().getOptionalDirectoryRef("."); SmallVector, 1> CWD; CWD.push_back({nullptr, *Dir}); OptionalFileEntryRef FE = HS.LookupFile(FileName, SourceLocation(), /*Angled*/ Input.getKind().getHeaderUnitKind() == InputKind::HeaderUnit_System, nullptr, nullptr, CWD, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr); if (!FE) { CI.getDiagnostics().Report(diag::err_module_header_file_not_found) << FileName; return false; } // We now have the filename... FileName = FE->getFileEntry().getName(); // ... still a header unit, but now use the path as written. Kind = Input.getKind().withHeaderUnit(InputKind::HeaderUnit_Abs); Input = FrontendInputFile(FileName, Kind, Input.isSystem()); } // Unless the user has overridden the name, the header unit module name is // the pathname for the file. 
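The implicit-PCH directory search above (take the first acceptable AST file in the directory, otherwise report an error) can be sketched with std::filesystem standing in for the llvm::vfs iteration, and an arbitrary predicate standing in for ASTReader::isAcceptableASTFile; both substitutions are assumptions of this sketch.

#include <filesystem>
#include <functional>
#include <optional>
#include <string>
#include <system_error>

namespace fs = std::filesystem;

// Return the first file in Dir accepted by IsAcceptable, if any.
std::optional<std::string>
findFirstAcceptable(const fs::path &Dir,
                    const std::function<bool(const fs::path &)> &IsAcceptable) {
  std::error_code EC;
  for (fs::directory_iterator It(Dir, EC), End; It != End && !EC;
       It.increment(EC))
    if (It->is_regular_file() && IsAcceptable(It->path()))
      return It->path().string();
  return std::nullopt; // caller would report an err_fe_no_pch_in_dir-style error
}

int main() {
  auto Hit = findFirstAcceptable(".", [](const fs::path &P) {
    return P.extension() == ".pch";
  });
  return Hit ? 0 : 1;
}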
if (CI.getLangOpts().ModuleName.empty()) CI.getLangOpts().ModuleName = std::string(FileName); CI.getLangOpts().CurrentModule = CI.getLangOpts().ModuleName; } if (!CI.InitializeSourceManager(Input)) return false; if (CI.getLangOpts().CPlusPlusModules && Input.getKind().isHeaderUnit() && Input.getKind().isPreprocessed() && !usesPreprocessorOnly()) { // We have an input filename like foo.iih, but we want to find the right // module name (and original file, to build the map entry). // Check if the first line specifies the original source file name with a // linemarker. std::string PresumedInputFile = std::string(getCurrentFileOrBufferName()); ReadOriginalFileName(CI, PresumedInputFile); // Unless the user overrides this, the module name is the name by which the // original file was known. if (CI.getLangOpts().ModuleName.empty()) CI.getLangOpts().ModuleName = std::string(PresumedInputFile); CI.getLangOpts().CurrentModule = CI.getLangOpts().ModuleName; } // For module map files, we first parse the module map and synthesize a // "" buffer before more conventional processing. if (Input.getKind().getFormat() == InputKind::ModuleMap) { CI.getLangOpts().setCompilingModule(LangOptions::CMK_ModuleMap); std::string PresumedModuleMapFile; unsigned OffsetToContents; if (loadModuleMapForModuleBuild(CI, Input.isSystem(), Input.isPreprocessed(), PresumedModuleMapFile, OffsetToContents)) return false; auto *CurrentModule = prepareToBuildModule(CI, Input.getFile()); if (!CurrentModule) return false; CurrentModule->PresumedModuleMapFile = PresumedModuleMapFile; if (OffsetToContents) // If the module contents are in the same file, skip to them. CI.getPreprocessor().setSkipMainFilePreamble(OffsetToContents, true); else { // Otherwise, convert the module description to a suitable input buffer. auto Buffer = getInputBufferForModule(CI, CurrentModule); if (!Buffer) return false; // Reinitialize the main file entry to refer to the new input. auto Kind = CurrentModule->IsSystem ? SrcMgr::C_System : SrcMgr::C_User; auto &SourceMgr = CI.getSourceManager(); auto BufferID = SourceMgr.createFileID(std::move(Buffer), Kind); assert(BufferID.isValid() && "couldn't create module buffer ID"); SourceMgr.setMainFileID(BufferID); } } // Initialize the action. if (!BeginSourceFileAction(CI)) return false; // If we were asked to load any module map files, do so now. for (const auto &Filename : CI.getFrontendOpts().ModuleMapFiles) { if (auto File = CI.getFileManager().getOptionalFileRef(Filename)) CI.getPreprocessor().getHeaderSearchInfo().loadModuleMapFile( *File, /*IsSystem*/false); else CI.getDiagnostics().Report(diag::err_module_map_not_found) << Filename; } // If compiling implementation of a module, load its module map file now. (void)CI.getPreprocessor().getCurrentModuleImplementation(); // Add a module declaration scope so that modules from -fmodule-map-file // arguments may shadow modules found implicitly in search paths. CI.getPreprocessor() .getHeaderSearchInfo() .getModuleMap() .finishModuleDeclarationScope(); // Create the AST context and consumer unless this is a preprocessor only // action. if (!usesPreprocessorOnly()) { // Parsing a model file should reuse the existing ASTContext. if (!isModelParsingAction()) CI.createASTContext(); // For preprocessed files, check if the first line specifies the original // source file name with a linemarker. 
std::string PresumedInputFile = std::string(getCurrentFileOrBufferName()); if (Input.isPreprocessed()) ReadOriginalFileName(CI, PresumedInputFile); std::unique_ptr Consumer = CreateWrappedASTConsumer(CI, PresumedInputFile); if (!Consumer) return false; // FIXME: should not overwrite ASTMutationListener when parsing model files? if (!isModelParsingAction()) CI.getASTContext().setASTMutationListener(Consumer->GetASTMutationListener()); if (!CI.getPreprocessorOpts().ChainedIncludes.empty()) { // Convert headers to PCH and chain them. IntrusiveRefCntPtr source, FinalReader; source = createChainedIncludesSource(CI, FinalReader); if (!source) return false; CI.setASTReader(static_cast(FinalReader.get())); CI.getASTContext().setExternalSource(source); } else if (CI.getLangOpts().Modules || !CI.getPreprocessorOpts().ImplicitPCHInclude.empty()) { // Use PCM or PCH. assert(hasPCHSupport() && "This action does not have PCH support!"); ASTDeserializationListener *DeserialListener = Consumer->GetASTDeserializationListener(); bool DeleteDeserialListener = false; if (CI.getPreprocessorOpts().DumpDeserializedPCHDecls) { DeserialListener = new DeserializedDeclsDumper(DeserialListener, DeleteDeserialListener); DeleteDeserialListener = true; } if (!CI.getPreprocessorOpts().DeserializedPCHDeclsToErrorOn.empty()) { DeserialListener = new DeserializedDeclsChecker( CI.getASTContext(), CI.getPreprocessorOpts().DeserializedPCHDeclsToErrorOn, DeserialListener, DeleteDeserialListener); DeleteDeserialListener = true; } if (!CI.getPreprocessorOpts().ImplicitPCHInclude.empty()) { CI.createPCHExternalASTSource( CI.getPreprocessorOpts().ImplicitPCHInclude, CI.getPreprocessorOpts().DisablePCHOrModuleValidation, CI.getPreprocessorOpts().AllowPCHWithCompilerErrors, DeserialListener, DeleteDeserialListener); if (!CI.getASTContext().getExternalSource()) return false; } // If modules are enabled, create the AST reader before creating // any builtins, so that all declarations know that they might be // extended by an external source. if (CI.getLangOpts().Modules || !CI.hasASTContext() || !CI.getASTContext().getExternalSource()) { CI.createASTReader(); CI.getASTReader()->setDeserializationListener(DeserialListener, DeleteDeserialListener); } } CI.setASTConsumer(std::move(Consumer)); if (!CI.hasASTConsumer()) return false; } // Initialize built-in info as long as we aren't using an external AST // source. if (CI.getLangOpts().Modules || !CI.hasASTContext() || !CI.getASTContext().getExternalSource()) { Preprocessor &PP = CI.getPreprocessor(); PP.getBuiltinInfo().initializeBuiltins(PP.getIdentifierTable(), PP.getLangOpts()); } else { // FIXME: If this is a problem, recover from it by creating a multiplex // source. assert((!CI.getLangOpts().Modules || CI.getASTReader()) && "modules enabled but created an external source that " "doesn't support modules"); } // If we were asked to load any module files, do so now. for (const auto &ModuleFile : CI.getFrontendOpts().ModuleFiles) if (!CI.loadModuleFile(ModuleFile)) return false; // If there is a layout overrides file, attach an external AST source that // provides the layouts from that file. 
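The listener chaining above (DeserializedDeclsDumper and DeserializedDeclsChecker each wrap the previous ASTDeserializationListener and may own it) is a plain decorator-with-ownership pattern; a minimal standalone sketch with hypothetical Listener/Delegating names follows.

#include <iostream>
#include <string>

struct Listener {
  virtual ~Listener() = default;
  virtual void declRead(const std::string &Name) {}
};

// Wraps a previous listener; deletes it on destruction only if asked to,
// mirroring the DeletePrevious flag used above.
class Delegating : public Listener {
  Listener *Previous;
  bool DeletePrevious;
public:
  Delegating(Listener *Previous, bool DeletePrevious)
      : Previous(Previous), DeletePrevious(DeletePrevious) {}
  ~Delegating() override { if (DeletePrevious) delete Previous; }
  void declRead(const std::string &Name) override {
    std::cout << "decl: " << Name << "\n";
    if (Previous) Previous->declRead(Name);
  }
};

int main() {
  Listener *L = nullptr; // e.g. the consumer's own listener, possibly null
  bool Owned = false;
  L = new Delegating(L, Owned); // first wrapper does not own the original
  Owned = true;                 // later wrappers own (and delete) the previous one
  L = new Delegating(L, Owned);
  L->declRead("foo");
  delete L; // tears down the whole chain
}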
if (!CI.getFrontendOpts().OverrideRecordLayoutsFile.empty() && CI.hasASTContext() && !CI.getASTContext().getExternalSource()) { IntrusiveRefCntPtr Override(new LayoutOverrideSource( CI.getFrontendOpts().OverrideRecordLayoutsFile)); CI.getASTContext().setExternalSource(Override); } // Setup HLSL External Sema Source if (CI.getLangOpts().HLSL && CI.hasASTContext()) { IntrusiveRefCntPtr HLSLSema( new HLSLExternalSemaSource()); if (auto *SemaSource = dyn_cast_if_present( CI.getASTContext().getExternalSource())) { IntrusiveRefCntPtr MultiSema( new MultiplexExternalSemaSource(SemaSource, HLSLSema.get())); CI.getASTContext().setExternalSource(MultiSema); } else CI.getASTContext().setExternalSource(HLSLSema); } FailureCleanup.release(); return true; } llvm::Error FrontendAction::Execute() { CompilerInstance &CI = getCompilerInstance(); if (CI.hasFrontendTimer()) { llvm::TimeRegion Timer(CI.getFrontendTimer()); ExecuteAction(); } else ExecuteAction(); // If we are supposed to rebuild the global module index, do so now unless // there were any module-build failures. if (CI.shouldBuildGlobalModuleIndex() && CI.hasFileManager() && CI.hasPreprocessor()) { StringRef Cache = CI.getPreprocessor().getHeaderSearchInfo().getModuleCachePath(); if (!Cache.empty()) { if (llvm::Error Err = GlobalModuleIndex::writeIndex( CI.getFileManager(), CI.getPCHContainerReader(), Cache)) { // FIXME this drops the error on the floor, but // Index/pch-from-libclang.c seems to rely on dropping at least some of // the error conditions! consumeError(std::move(Err)); } } } return llvm::Error::success(); } void FrontendAction::EndSourceFile() { CompilerInstance &CI = getCompilerInstance(); // Inform the diagnostic client we are done with this source file. CI.getDiagnosticClient().EndSourceFile(); // Inform the preprocessor we are done. if (CI.hasPreprocessor()) CI.getPreprocessor().EndSourceFile(); // Finalize the action. EndSourceFileAction(); // Sema references the ast consumer, so reset sema first. // // FIXME: There is more per-file stuff we could just drop here? bool DisableFree = CI.getFrontendOpts().DisableFree; if (DisableFree) { CI.resetAndLeakSema(); CI.resetAndLeakASTContext(); llvm::BuryPointer(CI.takeASTConsumer().get()); } else { CI.setSema(nullptr); CI.setASTContext(nullptr); CI.setASTConsumer(nullptr); } if (CI.getFrontendOpts().ShowStats) { llvm::errs() << "\nSTATISTICS FOR '" << getCurrentFileOrBufferName() << "':\n"; CI.getPreprocessor().PrintStats(); CI.getPreprocessor().getIdentifierTable().PrintStats(); CI.getPreprocessor().getHeaderSearchInfo().PrintStats(); CI.getSourceManager().PrintStats(); llvm::errs() << "\n"; } // Cleanup the output streams, and erase the output files if instructed by the // FrontendAction. CI.clearOutputFiles(/*EraseFiles=*/shouldEraseOutputFiles()); // The resources are owned by AST when the current file is AST. // So we reset the resources here to avoid users accessing it // accidently. 
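The DisableFree handling above (and continued just below for the AST-owned resources) intentionally leaks large objects at shutdown rather than destroying them; a sketch of that trade-off with std::unique_ptr, where buryPointer is a stand-in for llvm::BuryPointer.

#include <memory>
#include <vector>

struct BigSideTable { /* lots of AST-sized state */ };

// Stand-in for llvm::BuryPointer: keep the pointer reachable (so leak
// checkers stay quiet) but never run its destructor.
static std::vector<void *> GraveYard;
static void buryPointer(void *P) { GraveYard.push_back(P); }

void endSourceFile(std::unique_ptr<BigSideTable> Table, bool DisableFree) {
  if (DisableFree)
    buryPointer(Table.release()); // skip destruction to speed up shutdown
  // else: unique_ptr destroys Table normally when it goes out of scope
}

int main() {
  endSourceFile(std::make_unique<BigSideTable>(), /*DisableFree=*/true);
  endSourceFile(std::make_unique<BigSideTable>(), /*DisableFree=*/false);
}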
if (isCurrentFileAST()) { if (DisableFree) { CI.resetAndLeakPreprocessor(); CI.resetAndLeakSourceManager(); CI.resetAndLeakFileManager(); llvm::BuryPointer(std::move(CurrentASTUnit)); } else { CI.setPreprocessor(nullptr); CI.setSourceManager(nullptr); CI.setFileManager(nullptr); } } setCompilerInstance(nullptr); setCurrentInput(FrontendInputFile()); CI.getLangOpts().setCompilingModule(LangOptions::CMK_None); } bool FrontendAction::shouldEraseOutputFiles() { return getCompilerInstance().getDiagnostics().hasErrorOccurred(); } //===----------------------------------------------------------------------===// // Utility Actions //===----------------------------------------------------------------------===// void ASTFrontendAction::ExecuteAction() { CompilerInstance &CI = getCompilerInstance(); if (!CI.hasPreprocessor()) return; + // This is a fallback: If the client forgets to invoke this, we mark the + // current stack as the bottom. Though not optimal, this could help prevent + // stack overflow during deep recursion. + clang::noteBottomOfStack(); // FIXME: Move the truncation aspect of this into Sema, we delayed this till // here so the source manager would be initialized. if (hasCodeCompletionSupport() && !CI.getFrontendOpts().CodeCompletionAt.FileName.empty()) CI.createCodeCompletionConsumer(); // Use a code completion consumer? CodeCompleteConsumer *CompletionConsumer = nullptr; if (CI.hasCodeCompletionConsumer()) CompletionConsumer = &CI.getCodeCompletionConsumer(); if (!CI.hasSema()) CI.createSema(getTranslationUnitKind(), CompletionConsumer); ParseAST(CI.getSema(), CI.getFrontendOpts().ShowStats, CI.getFrontendOpts().SkipFunctionBodies); } void PluginASTAction::anchor() { } std::unique_ptr PreprocessorFrontendAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) { llvm_unreachable("Invalid CreateASTConsumer on preprocessor action!"); } bool WrapperFrontendAction::PrepareToExecuteAction(CompilerInstance &CI) { return WrappedAction->PrepareToExecuteAction(CI); } std::unique_ptr WrapperFrontendAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) { return WrappedAction->CreateASTConsumer(CI, InFile); } bool WrapperFrontendAction::BeginInvocation(CompilerInstance &CI) { return WrappedAction->BeginInvocation(CI); } bool WrapperFrontendAction::BeginSourceFileAction(CompilerInstance &CI) { WrappedAction->setCurrentInput(getCurrentInput()); WrappedAction->setCompilerInstance(&CI); auto Ret = WrappedAction->BeginSourceFileAction(CI); // BeginSourceFileAction may change CurrentInput, e.g. during module builds. 
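The clang::noteBottomOfStack() fallback added in ExecuteAction above records an address near the bottom of the stack so that deeply recursive code can later estimate how much stack remains. The general idea as a standalone sketch (the exact addresses and the "stack grows downward" assumption are platform-dependent):

#include <cstdint>
#include <cstdio>

static uintptr_t BottomOfStack = 0;

// Record an address in the current frame as an approximate stack bottom.
static void noteBottom() {
  char Marker;
  BottomOfStack = reinterpret_cast<uintptr_t>(&Marker);
}

// Rough number of bytes of stack used since noteBottom() was called,
// assuming the stack grows toward lower addresses.
static uintptr_t stackUsed() {
  char Marker;
  uintptr_t Here = reinterpret_cast<uintptr_t>(&Marker);
  return Here < BottomOfStack ? BottomOfStack - Here : 0;
}

static void recurse(int N) {
  if (N == 0) {
    std::printf("approx. stack used: %llu bytes\n",
                (unsigned long long)stackUsed());
    return;
  }
  recurse(N - 1);
}

int main() {
  noteBottom(); // analogous to the fallback call in ExecuteAction
  recurse(1000);
}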
setCurrentInput(WrappedAction->getCurrentInput()); return Ret; } void WrapperFrontendAction::ExecuteAction() { WrappedAction->ExecuteAction(); } void WrapperFrontendAction::EndSourceFile() { WrappedAction->EndSourceFile(); } void WrapperFrontendAction::EndSourceFileAction() { WrappedAction->EndSourceFileAction(); } bool WrapperFrontendAction::shouldEraseOutputFiles() { return WrappedAction->shouldEraseOutputFiles(); } bool WrapperFrontendAction::usesPreprocessorOnly() const { return WrappedAction->usesPreprocessorOnly(); } TranslationUnitKind WrapperFrontendAction::getTranslationUnitKind() { return WrappedAction->getTranslationUnitKind(); } bool WrapperFrontendAction::hasPCHSupport() const { return WrappedAction->hasPCHSupport(); } bool WrapperFrontendAction::hasASTFileSupport() const { return WrappedAction->hasASTFileSupport(); } bool WrapperFrontendAction::hasIRSupport() const { return WrappedAction->hasIRSupport(); } bool WrapperFrontendAction::hasCodeCompletionSupport() const { return WrappedAction->hasCodeCompletionSupport(); } WrapperFrontendAction::WrapperFrontendAction( std::unique_ptr WrappedAction) : WrappedAction(std::move(WrappedAction)) {} diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 423d5372a6f6..1cff4a75790e 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -1,9182 +1,9191 @@ //===--- SemaExprCXX.cpp - Semantic Analysis for Expressions --------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// /// \file /// Implements semantic analysis for C++ expressions. /// //===----------------------------------------------------------------------===// #include "TreeTransform.h" #include "TypeLocBuilder.h" #include "clang/AST/ASTContext.h" #include "clang/AST/ASTLambda.h" #include "clang/AST/CXXInheritance.h" #include "clang/AST/CharUnits.h" #include "clang/AST/DeclObjC.h" #include "clang/AST/ExprCXX.h" +#include "clang/AST/ExprConcepts.h" #include "clang/AST/ExprObjC.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Type.h" #include "clang/AST/TypeLoc.h" #include "clang/Basic/AlignedAllocation.h" #include "clang/Basic/DiagnosticSema.h" #include "clang/Basic/PartialDiagnostic.h" #include "clang/Basic/TargetInfo.h" #include "clang/Basic/TokenKinds.h" #include "clang/Basic/TypeTraits.h" #include "clang/Lex/Preprocessor.h" #include "clang/Sema/DeclSpec.h" #include "clang/Sema/EnterExpressionEvaluationContext.h" #include "clang/Sema/Initialization.h" #include "clang/Sema/Lookup.h" #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/Scope.h" #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/SemaInternal.h" #include "clang/Sema/SemaLambda.h" #include "clang/Sema/Template.h" #include "clang/Sema/TemplateDeduction.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TypeSize.h" #include using namespace clang; using namespace sema; /// Handle the result of the special case name lookup for inheriting /// constructor declarations. 'NS::X::X' and 'NS::X<...>::X' are treated as /// constructor names in member using declarations, even if 'X' is not the /// name of the corresponding type. 
ParsedType Sema::getInheritingConstructorName(CXXScopeSpec &SS, SourceLocation NameLoc, IdentifierInfo &Name) { NestedNameSpecifier *NNS = SS.getScopeRep(); // Convert the nested-name-specifier into a type. QualType Type; switch (NNS->getKind()) { case NestedNameSpecifier::TypeSpec: case NestedNameSpecifier::TypeSpecWithTemplate: Type = QualType(NNS->getAsType(), 0); break; case NestedNameSpecifier::Identifier: // Strip off the last layer of the nested-name-specifier and build a // typename type for it. assert(NNS->getAsIdentifier() == &Name && "not a constructor name"); Type = Context.getDependentNameType(ETK_None, NNS->getPrefix(), NNS->getAsIdentifier()); break; case NestedNameSpecifier::Global: case NestedNameSpecifier::Super: case NestedNameSpecifier::Namespace: case NestedNameSpecifier::NamespaceAlias: llvm_unreachable("Nested name specifier is not a type for inheriting ctor"); } // This reference to the type is located entirely at the location of the // final identifier in the qualified-id. return CreateParsedType(Type, Context.getTrivialTypeSourceInfo(Type, NameLoc)); } ParsedType Sema::getConstructorName(IdentifierInfo &II, SourceLocation NameLoc, Scope *S, CXXScopeSpec &SS, bool EnteringContext) { CXXRecordDecl *CurClass = getCurrentClass(S, &SS); assert(CurClass && &II == CurClass->getIdentifier() && "not a constructor name"); // When naming a constructor as a member of a dependent context (eg, in a // friend declaration or an inherited constructor declaration), form an // unresolved "typename" type. if (CurClass->isDependentContext() && !EnteringContext && SS.getScopeRep()) { QualType T = Context.getDependentNameType(ETK_None, SS.getScopeRep(), &II); return ParsedType::make(T); } if (SS.isNotEmpty() && RequireCompleteDeclContext(SS, CurClass)) return ParsedType(); // Find the injected-class-name declaration. Note that we make no attempt to // diagnose cases where the injected-class-name is shadowed: the only // declaration that can validly shadow the injected-class-name is a // non-static data member, and if the class contains both a non-static data // member and a constructor then it is ill-formed (we check that in // CheckCompletedCXXClass). CXXRecordDecl *InjectedClassName = nullptr; for (NamedDecl *ND : CurClass->lookup(&II)) { auto *RD = dyn_cast(ND); if (RD && RD->isInjectedClassName()) { InjectedClassName = RD; break; } } if (!InjectedClassName) { if (!CurClass->isInvalidDecl()) { // FIXME: RequireCompleteDeclContext doesn't check dependent contexts // properly. Work around it here for now. Diag(SS.getLastQualifierNameLoc(), diag::err_incomplete_nested_name_spec) << CurClass << SS.getRange(); } return ParsedType(); } QualType T = Context.getTypeDeclType(InjectedClassName); DiagnoseUseOfDecl(InjectedClassName, NameLoc); MarkAnyDeclReferenced(NameLoc, InjectedClassName, /*OdrUse=*/false); return ParsedType::make(T); } ParsedType Sema::getDestructorName(SourceLocation TildeLoc, IdentifierInfo &II, SourceLocation NameLoc, Scope *S, CXXScopeSpec &SS, ParsedType ObjectTypePtr, bool EnteringContext) { // Determine where to perform name lookup. // FIXME: This area of the standard is very messy, and the current // wording is rather unclear about which scopes we search for the // destructor name; see core issues 399 and 555. 
Issue 399 in // particular shows where the current description of destructor name // lookup is completely out of line with existing practice, e.g., // this appears to be ill-formed: // // namespace N { // template struct S { // ~S(); // }; // } // // void f(N::S* s) { // s->N::S::~S(); // } // // See also PR6358 and PR6359. // // For now, we accept all the cases in which the name given could plausibly // be interpreted as a correct destructor name, issuing off-by-default // extension diagnostics on the cases that don't strictly conform to the // C++20 rules. This basically means we always consider looking in the // nested-name-specifier prefix, the complete nested-name-specifier, and // the scope, and accept if we find the expected type in any of the three // places. if (SS.isInvalid()) return nullptr; // Whether we've failed with a diagnostic already. bool Failed = false; llvm::SmallVector FoundDecls; llvm::SmallPtrSet, 8> FoundDeclSet; // If we have an object type, it's because we are in a // pseudo-destructor-expression or a member access expression, and // we know what type we're looking for. QualType SearchType = ObjectTypePtr ? GetTypeFromParser(ObjectTypePtr) : QualType(); auto CheckLookupResult = [&](LookupResult &Found) -> ParsedType { auto IsAcceptableResult = [&](NamedDecl *D) -> bool { auto *Type = dyn_cast(D->getUnderlyingDecl()); if (!Type) return false; if (SearchType.isNull() || SearchType->isDependentType()) return true; QualType T = Context.getTypeDeclType(Type); return Context.hasSameUnqualifiedType(T, SearchType); }; unsigned NumAcceptableResults = 0; for (NamedDecl *D : Found) { if (IsAcceptableResult(D)) ++NumAcceptableResults; // Don't list a class twice in the lookup failure diagnostic if it's // found by both its injected-class-name and by the name in the enclosing // scope. if (auto *RD = dyn_cast(D)) if (RD->isInjectedClassName()) D = cast(RD->getParent()); if (FoundDeclSet.insert(D).second) FoundDecls.push_back(D); } // As an extension, attempt to "fix" an ambiguity by erasing all non-type // results, and all non-matching results if we have a search type. It's not // clear what the right behavior is if destructor lookup hits an ambiguity, // but other compilers do generally accept at least some kinds of // ambiguity. 
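The extension described above (salvage an ambiguous destructor lookup when exactly one candidate is an acceptable type) reduces to a small filtering step; in this sketch, Candidate is a hypothetical stand-in for the NamedDecl lookup results and the two flags stand in for IsAcceptableResult.

#include <algorithm>
#include <string>
#include <vector>

struct Candidate {
  std::string Name;
  bool IsType;        // stands in for "declares a type"
  bool MatchesSearch; // stands in for hasSameUnqualifiedType(T, SearchType)
};

// If the lookup is ambiguous but exactly one candidate is acceptable, drop
// the others (mirroring the Filter loop below); otherwise leave it alone.
bool disambiguate(std::vector<Candidate> &Found) {
  auto Acceptable = [](const Candidate &C) { return C.IsType && C.MatchesSearch; };
  if (Found.size() > 1 &&
      std::count_if(Found.begin(), Found.end(), Acceptable) == 1) {
    Found.erase(std::remove_if(Found.begin(), Found.end(),
                               [&](const Candidate &C) { return !Acceptable(C); }),
                Found.end());
    return true; // a diagnostic like ext_dtor_name_ambiguous would go here
  }
  return Found.size() == 1;
}

int main() {
  std::vector<Candidate> Found = {{"S", true, true}, {"S", false, false}};
  return disambiguate(Found) && Found.size() == 1 ? 0 : 1;
}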
if (Found.isAmbiguous() && NumAcceptableResults == 1) { Diag(NameLoc, diag::ext_dtor_name_ambiguous); LookupResult::Filter F = Found.makeFilter(); while (F.hasNext()) { NamedDecl *D = F.next(); if (auto *TD = dyn_cast(D->getUnderlyingDecl())) Diag(D->getLocation(), diag::note_destructor_type_here) << Context.getTypeDeclType(TD); else Diag(D->getLocation(), diag::note_destructor_nontype_here); if (!IsAcceptableResult(D)) F.erase(); } F.done(); } if (Found.isAmbiguous()) Failed = true; if (TypeDecl *Type = Found.getAsSingle()) { if (IsAcceptableResult(Type)) { QualType T = Context.getTypeDeclType(Type); MarkAnyDeclReferenced(Type->getLocation(), Type, /*OdrUse=*/false); return CreateParsedType(Context.getElaboratedType(ETK_None, nullptr, T), Context.getTrivialTypeSourceInfo(T, NameLoc)); } } return nullptr; }; bool IsDependent = false; auto LookupInObjectType = [&]() -> ParsedType { if (Failed || SearchType.isNull()) return nullptr; IsDependent |= SearchType->isDependentType(); LookupResult Found(*this, &II, NameLoc, LookupDestructorName); DeclContext *LookupCtx = computeDeclContext(SearchType); if (!LookupCtx) return nullptr; LookupQualifiedName(Found, LookupCtx); return CheckLookupResult(Found); }; auto LookupInNestedNameSpec = [&](CXXScopeSpec &LookupSS) -> ParsedType { if (Failed) return nullptr; IsDependent |= isDependentScopeSpecifier(LookupSS); DeclContext *LookupCtx = computeDeclContext(LookupSS, EnteringContext); if (!LookupCtx) return nullptr; LookupResult Found(*this, &II, NameLoc, LookupDestructorName); if (RequireCompleteDeclContext(LookupSS, LookupCtx)) { Failed = true; return nullptr; } LookupQualifiedName(Found, LookupCtx); return CheckLookupResult(Found); }; auto LookupInScope = [&]() -> ParsedType { if (Failed || !S) return nullptr; LookupResult Found(*this, &II, NameLoc, LookupDestructorName); LookupName(Found, S); return CheckLookupResult(Found); }; // C++2a [basic.lookup.qual]p6: // In a qualified-id of the form // // nested-name-specifier[opt] type-name :: ~ type-name // // the second type-name is looked up in the same scope as the first. // // We interpret this as meaning that if you do a dual-scope lookup for the // first name, you also do a dual-scope lookup for the second name, per // C++ [basic.lookup.classref]p4: // // If the id-expression in a class member access is a qualified-id of the // form // // class-name-or-namespace-name :: ... // // the class-name-or-namespace-name following the . or -> is first looked // up in the class of the object expression and the name, if found, is used. // Otherwise, it is looked up in the context of the entire // postfix-expression. // // This looks in the same scopes as for an unqualified destructor name: // // C++ [basic.lookup.classref]p3: // If the unqualified-id is ~ type-name, the type-name is looked up // in the context of the entire postfix-expression. If the type T // of the object expression is of a class type C, the type-name is // also looked up in the scope of class C. At least one of the // lookups shall find a name that refers to cv T. // // FIXME: The intent is unclear here. Should type-name::~type-name look in // the scope anyway if it finds a non-matching name declared in the class? // If both lookups succeed and find a dependent result, which result should // we retain? (Same question for p->~type-name().) if (NestedNameSpecifier *Prefix = SS.isSet() ? 
SS.getScopeRep()->getPrefix() : nullptr) { // This is // // nested-name-specifier type-name :: ~ type-name // // Look for the second type-name in the nested-name-specifier. CXXScopeSpec PrefixSS; PrefixSS.Adopt(NestedNameSpecifierLoc(Prefix, SS.location_data())); if (ParsedType T = LookupInNestedNameSpec(PrefixSS)) return T; } else { // This is one of // // type-name :: ~ type-name // ~ type-name // // Look in the scope and (if any) the object type. if (ParsedType T = LookupInScope()) return T; if (ParsedType T = LookupInObjectType()) return T; } if (Failed) return nullptr; if (IsDependent) { // We didn't find our type, but that's OK: it's dependent anyway. // FIXME: What if we have no nested-name-specifier? QualType T = CheckTypenameType(ETK_None, SourceLocation(), SS.getWithLocInContext(Context), II, NameLoc); return ParsedType::make(T); } // The remaining cases are all non-standard extensions imitating the behavior // of various other compilers. unsigned NumNonExtensionDecls = FoundDecls.size(); if (SS.isSet()) { // For compatibility with older broken C++ rules and existing code, // // nested-name-specifier :: ~ type-name // // also looks for type-name within the nested-name-specifier. if (ParsedType T = LookupInNestedNameSpec(SS)) { Diag(SS.getEndLoc(), diag::ext_dtor_named_in_wrong_scope) << SS.getRange() << FixItHint::CreateInsertion(SS.getEndLoc(), ("::" + II.getName()).str()); return T; } // For compatibility with other compilers and older versions of Clang, // // nested-name-specifier type-name :: ~ type-name // // also looks for type-name in the scope. Unfortunately, we can't // reasonably apply this fallback for dependent nested-name-specifiers. if (SS.isValid() && SS.getScopeRep()->getPrefix()) { if (ParsedType T = LookupInScope()) { Diag(SS.getEndLoc(), diag::ext_qualified_dtor_named_in_lexical_scope) << FixItHint::CreateRemoval(SS.getRange()); Diag(FoundDecls.back()->getLocation(), diag::note_destructor_type_here) << GetTypeFromParser(T); return T; } } } // We didn't find anything matching; tell the user what we did find (if // anything). // Don't tell the user about declarations we shouldn't have found. FoundDecls.resize(NumNonExtensionDecls); // List types before non-types. std::stable_sort(FoundDecls.begin(), FoundDecls.end(), [](NamedDecl *A, NamedDecl *B) { return isa(A->getUnderlyingDecl()) > isa(B->getUnderlyingDecl()); }); // Suggest a fixit to properly name the destroyed type. auto MakeFixItHint = [&]{ const CXXRecordDecl *Destroyed = nullptr; // FIXME: If we have a scope specifier, suggest its last component? if (!SearchType.isNull()) Destroyed = SearchType->getAsCXXRecordDecl(); else if (S) Destroyed = dyn_cast_or_null(S->getEntity()); if (Destroyed) return FixItHint::CreateReplacement(SourceRange(NameLoc), Destroyed->getNameAsString()); return FixItHint(); }; if (FoundDecls.empty()) { // FIXME: Attempt typo-correction? Diag(NameLoc, diag::err_undeclared_destructor_name) << &II << MakeFixItHint(); } else if (!SearchType.isNull() && FoundDecls.size() == 1) { if (auto *TD = dyn_cast(FoundDecls[0]->getUnderlyingDecl())) { assert(!SearchType.isNull() && "should only reject a type result if we have a search type"); QualType T = Context.getTypeDeclType(TD); Diag(NameLoc, diag::err_destructor_expr_type_mismatch) << T << SearchType << MakeFixItHint(); } else { Diag(NameLoc, diag::err_destructor_expr_nontype) << &II << MakeFixItHint(); } } else { Diag(NameLoc, SearchType.isNull() ? 
diag::err_destructor_name_nontype : diag::err_destructor_expr_mismatch) << &II << SearchType << MakeFixItHint(); } for (NamedDecl *FoundD : FoundDecls) { if (auto *TD = dyn_cast(FoundD->getUnderlyingDecl())) Diag(FoundD->getLocation(), diag::note_destructor_type_here) << Context.getTypeDeclType(TD); else Diag(FoundD->getLocation(), diag::note_destructor_nontype_here) << FoundD; } return nullptr; } ParsedType Sema::getDestructorTypeForDecltype(const DeclSpec &DS, ParsedType ObjectType) { if (DS.getTypeSpecType() == DeclSpec::TST_error) return nullptr; if (DS.getTypeSpecType() == DeclSpec::TST_decltype_auto) { Diag(DS.getTypeSpecTypeLoc(), diag::err_decltype_auto_invalid); return nullptr; } assert(DS.getTypeSpecType() == DeclSpec::TST_decltype && "unexpected type in getDestructorType"); QualType T = BuildDecltypeType(DS.getRepAsExpr()); // If we know the type of the object, check that the correct destructor // type was named now; we can give better diagnostics this way. QualType SearchType = GetTypeFromParser(ObjectType); if (!SearchType.isNull() && !SearchType->isDependentType() && !Context.hasSameUnqualifiedType(T, SearchType)) { Diag(DS.getTypeSpecTypeLoc(), diag::err_destructor_expr_type_mismatch) << T << SearchType; return nullptr; } return ParsedType::make(T); } bool Sema::checkLiteralOperatorId(const CXXScopeSpec &SS, const UnqualifiedId &Name, bool IsUDSuffix) { assert(Name.getKind() == UnqualifiedIdKind::IK_LiteralOperatorId); if (!IsUDSuffix) { // [over.literal] p8 // // double operator""_Bq(long double); // OK: not a reserved identifier // double operator"" _Bq(long double); // ill-formed, no diagnostic required IdentifierInfo *II = Name.Identifier; ReservedIdentifierStatus Status = II->isReserved(PP.getLangOpts()); SourceLocation Loc = Name.getEndLoc(); if (!PP.getSourceManager().isInSystemHeader(Loc)) { if (auto Hint = FixItHint::CreateReplacement( Name.getSourceRange(), (StringRef("operator\"\"") + II->getName()).str()); isReservedInAllContexts(Status)) { Diag(Loc, diag::warn_reserved_extern_symbol) << II << static_cast(Status) << Hint; } else { Diag(Loc, diag::warn_deprecated_literal_operator_id) << II << Hint; } } } if (!SS.isValid()) return false; switch (SS.getScopeRep()->getKind()) { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::TypeSpec: case NestedNameSpecifier::TypeSpecWithTemplate: // Per C++11 [over.literal]p2, literal operators can only be declared at // namespace scope. Therefore, this unqualified-id cannot name anything. // Reject it early, because we have no AST representation for this in the // case where the scope is dependent. Diag(Name.getBeginLoc(), diag::err_literal_operator_id_outside_namespace) << SS.getScopeRep(); return true; case NestedNameSpecifier::Global: case NestedNameSpecifier::Super: case NestedNameSpecifier::Namespace: case NestedNameSpecifier::NamespaceAlias: return false; } llvm_unreachable("unknown nested name specifier kind"); } /// Build a C++ typeid expression with a type operand. ExprResult Sema::BuildCXXTypeId(QualType TypeInfoType, SourceLocation TypeidLoc, TypeSourceInfo *Operand, SourceLocation RParenLoc) { // C++ [expr.typeid]p4: // The top-level cv-qualifiers of the lvalue expression or the type-id // that is the operand of typeid are always ignored. // If the type of the type-id is a class type or a reference to a class // type, the class shall be completely-defined. 
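  // For example (illustrative only):
  //
  //   struct Incomplete;
  //   typeid(Incomplete);    // error: incomplete class type
  //   typeid(Incomplete &);  // error: reference to incomplete class type
  //   typeid(Incomplete *);  // OK: the pointer type itself is complete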
Qualifiers Quals; QualType T = Context.getUnqualifiedArrayType(Operand->getType().getNonReferenceType(), Quals); if (T->getAs() && RequireCompleteType(TypeidLoc, T, diag::err_incomplete_typeid)) return ExprError(); if (T->isVariablyModifiedType()) return ExprError(Diag(TypeidLoc, diag::err_variably_modified_typeid) << T); if (CheckQualifiedFunctionForTypeId(T, TypeidLoc)) return ExprError(); return new (Context) CXXTypeidExpr(TypeInfoType.withConst(), Operand, SourceRange(TypeidLoc, RParenLoc)); } /// Build a C++ typeid expression with an expression operand. ExprResult Sema::BuildCXXTypeId(QualType TypeInfoType, SourceLocation TypeidLoc, Expr *E, SourceLocation RParenLoc) { bool WasEvaluated = false; if (E && !E->isTypeDependent()) { if (E->hasPlaceholderType()) { ExprResult result = CheckPlaceholderExpr(E); if (result.isInvalid()) return ExprError(); E = result.get(); } QualType T = E->getType(); if (const RecordType *RecordT = T->getAs()) { CXXRecordDecl *RecordD = cast(RecordT->getDecl()); // C++ [expr.typeid]p3: // [...] If the type of the expression is a class type, the class // shall be completely-defined. if (RequireCompleteType(TypeidLoc, T, diag::err_incomplete_typeid)) return ExprError(); // C++ [expr.typeid]p3: // When typeid is applied to an expression other than an glvalue of a // polymorphic class type [...] [the] expression is an unevaluated // operand. [...] if (RecordD->isPolymorphic() && E->isGLValue()) { if (isUnevaluatedContext()) { // The operand was processed in unevaluated context, switch the // context and recheck the subexpression. ExprResult Result = TransformToPotentiallyEvaluated(E); if (Result.isInvalid()) return ExprError(); E = Result.get(); } // We require a vtable to query the type at run time. MarkVTableUsed(TypeidLoc, RecordD); WasEvaluated = true; } } ExprResult Result = CheckUnevaluatedOperand(E); if (Result.isInvalid()) return ExprError(); E = Result.get(); // C++ [expr.typeid]p4: // [...] If the type of the type-id is a reference to a possibly // cv-qualified type, the result of the typeid expression refers to a // std::type_info object representing the cv-unqualified referenced // type. Qualifiers Quals; QualType UnqualT = Context.getUnqualifiedArrayType(T, Quals); if (!Context.hasSameType(T, UnqualT)) { T = UnqualT; E = ImpCastExprToType(E, UnqualT, CK_NoOp, E->getValueKind()).get(); } } if (E->getType()->isVariablyModifiedType()) return ExprError(Diag(TypeidLoc, diag::err_variably_modified_typeid) << E->getType()); else if (!inTemplateInstantiation() && E->HasSideEffects(Context, WasEvaluated)) { // The expression operand for typeid is in an unevaluated expression // context, so side effects could result in unintended consequences. Diag(E->getExprLoc(), WasEvaluated ? diag::warn_side_effects_typeid : diag::warn_side_effects_unevaluated_context); } return new (Context) CXXTypeidExpr(TypeInfoType.withConst(), E, SourceRange(TypeidLoc, RParenLoc)); } /// ActOnCXXTypeidOfType - Parse typeid( type-id ) or typeid (expression); ExprResult Sema::ActOnCXXTypeid(SourceLocation OpLoc, SourceLocation LParenLoc, bool isType, void *TyOrExpr, SourceLocation RParenLoc) { // typeid is not supported in OpenCL. if (getLangOpts().OpenCLCPlusPlus) { return ExprError(Diag(OpLoc, diag::err_openclcxx_not_supported) << "typeid"); } // Find the std::type_info type. 
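  // Illustrative only: without <typeinfo> (and hence without a visible
  // declaration of std::type_info), any use of typeid is diagnosed below:
  //
  //   // no #include <typeinfo>
  //   auto &TI = typeid(int);   // error: need to include <typeinfo> first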
if (!getStdNamespace()) return ExprError(Diag(OpLoc, diag::err_need_header_before_typeid)); if (!CXXTypeInfoDecl) { IdentifierInfo *TypeInfoII = &PP.getIdentifierTable().get("type_info"); LookupResult R(*this, TypeInfoII, SourceLocation(), LookupTagName); LookupQualifiedName(R, getStdNamespace()); CXXTypeInfoDecl = R.getAsSingle(); // Microsoft's typeinfo doesn't have type_info in std but in the global // namespace if _HAS_EXCEPTIONS is defined to 0. See PR13153. if (!CXXTypeInfoDecl && LangOpts.MSVCCompat) { LookupQualifiedName(R, Context.getTranslationUnitDecl()); CXXTypeInfoDecl = R.getAsSingle(); } if (!CXXTypeInfoDecl) return ExprError(Diag(OpLoc, diag::err_need_header_before_typeid)); } if (!getLangOpts().RTTI) { return ExprError(Diag(OpLoc, diag::err_no_typeid_with_fno_rtti)); } QualType TypeInfoType = Context.getTypeDeclType(CXXTypeInfoDecl); if (isType) { // The operand is a type; handle it as such. TypeSourceInfo *TInfo = nullptr; QualType T = GetTypeFromParser(ParsedType::getFromOpaquePtr(TyOrExpr), &TInfo); if (T.isNull()) return ExprError(); if (!TInfo) TInfo = Context.getTrivialTypeSourceInfo(T, OpLoc); return BuildCXXTypeId(TypeInfoType, OpLoc, TInfo, RParenLoc); } // The operand is an expression. ExprResult Result = BuildCXXTypeId(TypeInfoType, OpLoc, (Expr *)TyOrExpr, RParenLoc); if (!getLangOpts().RTTIData && !Result.isInvalid()) if (auto *CTE = dyn_cast(Result.get())) if (CTE->isPotentiallyEvaluated() && !CTE->isMostDerived(Context)) Diag(OpLoc, diag::warn_no_typeid_with_rtti_disabled) << (getDiagnostics().getDiagnosticOptions().getFormat() == DiagnosticOptions::MSVC); return Result; } /// Grabs __declspec(uuid()) off a type, or returns 0 if we cannot resolve to /// a single GUID. static void getUuidAttrOfType(Sema &SemaRef, QualType QT, llvm::SmallSetVector &UuidAttrs) { // Optionally remove one level of pointer, reference or array indirection. const Type *Ty = QT.getTypePtr(); if (QT->isPointerType() || QT->isReferenceType()) Ty = QT->getPointeeType().getTypePtr(); else if (QT->isArrayType()) Ty = Ty->getBaseElementTypeUnsafe(); const auto *TD = Ty->getAsTagDecl(); if (!TD) return; if (const auto *Uuid = TD->getMostRecentDecl()->getAttr()) { UuidAttrs.insert(Uuid); return; } // __uuidof can grab UUIDs from template arguments. if (const auto *CTSD = dyn_cast(TD)) { const TemplateArgumentList &TAL = CTSD->getTemplateArgs(); for (const TemplateArgument &TA : TAL.asArray()) { const UuidAttr *UuidForTA = nullptr; if (TA.getKind() == TemplateArgument::Type) getUuidAttrOfType(SemaRef, TA.getAsType(), UuidAttrs); else if (TA.getKind() == TemplateArgument::Declaration) getUuidAttrOfType(SemaRef, TA.getAsDecl()->getType(), UuidAttrs); if (UuidForTA) UuidAttrs.insert(UuidForTA); } } } /// Build a Microsoft __uuidof expression with a type operand. ExprResult Sema::BuildCXXUuidof(QualType Type, SourceLocation TypeidLoc, TypeSourceInfo *Operand, SourceLocation RParenLoc) { MSGuidDecl *Guid = nullptr; if (!Operand->getType()->isDependentType()) { llvm::SmallSetVector UuidAttrs; getUuidAttrOfType(*this, Operand->getType(), UuidAttrs); if (UuidAttrs.empty()) return ExprError(Diag(TypeidLoc, diag::err_uuidof_without_guid)); if (UuidAttrs.size() > 1) return ExprError(Diag(TypeidLoc, diag::err_uuidof_with_multiple_guids)); Guid = UuidAttrs.back()->getGuidDecl(); } return new (Context) CXXUuidofExpr(Type, Operand, Guid, SourceRange(TypeidLoc, RParenLoc)); } /// Build a Microsoft __uuidof expression with an expression operand. 
ExprResult Sema::BuildCXXUuidof(QualType Type, SourceLocation TypeidLoc, Expr *E, SourceLocation RParenLoc) { MSGuidDecl *Guid = nullptr; if (!E->getType()->isDependentType()) { if (E->isNullPointerConstant(Context, Expr::NPC_ValueDependentIsNull)) { // A null pointer results in {00000000-0000-0000-0000-000000000000}. Guid = Context.getMSGuidDecl(MSGuidDecl::Parts{}); } else { llvm::SmallSetVector UuidAttrs; getUuidAttrOfType(*this, E->getType(), UuidAttrs); if (UuidAttrs.empty()) return ExprError(Diag(TypeidLoc, diag::err_uuidof_without_guid)); if (UuidAttrs.size() > 1) return ExprError(Diag(TypeidLoc, diag::err_uuidof_with_multiple_guids)); Guid = UuidAttrs.back()->getGuidDecl(); } } return new (Context) CXXUuidofExpr(Type, E, Guid, SourceRange(TypeidLoc, RParenLoc)); } /// ActOnCXXUuidof - Parse __uuidof( type-id ) or __uuidof (expression); ExprResult Sema::ActOnCXXUuidof(SourceLocation OpLoc, SourceLocation LParenLoc, bool isType, void *TyOrExpr, SourceLocation RParenLoc) { QualType GuidType = Context.getMSGuidType(); GuidType.addConst(); if (isType) { // The operand is a type; handle it as such. TypeSourceInfo *TInfo = nullptr; QualType T = GetTypeFromParser(ParsedType::getFromOpaquePtr(TyOrExpr), &TInfo); if (T.isNull()) return ExprError(); if (!TInfo) TInfo = Context.getTrivialTypeSourceInfo(T, OpLoc); return BuildCXXUuidof(GuidType, OpLoc, TInfo, RParenLoc); } // The operand is an expression. return BuildCXXUuidof(GuidType, OpLoc, (Expr*)TyOrExpr, RParenLoc); } /// ActOnCXXBoolLiteral - Parse {true,false} literals. ExprResult Sema::ActOnCXXBoolLiteral(SourceLocation OpLoc, tok::TokenKind Kind) { assert((Kind == tok::kw_true || Kind == tok::kw_false) && "Unknown C++ Boolean value!"); return new (Context) CXXBoolLiteralExpr(Kind == tok::kw_true, Context.BoolTy, OpLoc); } /// ActOnCXXNullPtrLiteral - Parse 'nullptr'. ExprResult Sema::ActOnCXXNullPtrLiteral(SourceLocation Loc) { return new (Context) CXXNullPtrLiteralExpr(Context.NullPtrTy, Loc); } /// ActOnCXXThrow - Parse throw expressions. ExprResult Sema::ActOnCXXThrow(Scope *S, SourceLocation OpLoc, Expr *Ex) { bool IsThrownVarInScope = false; if (Ex) { // C++0x [class.copymove]p31: // When certain criteria are met, an implementation is allowed to omit the // copy/move construction of a class object [...] // // - in a throw-expression, when the operand is the name of a // non-volatile automatic object (other than a function or catch- // clause parameter) whose scope does not extend beyond the end of the // innermost enclosing try-block (if there is one), the copy/move // operation from the operand to the exception object (15.1) can be // omitted by constructing the automatic object directly into the // exception object if (DeclRefExpr *DRE = dyn_cast(Ex->IgnoreParens())) if (VarDecl *Var = dyn_cast(DRE->getDecl())) { if (Var->hasLocalStorage() && !Var->getType().isVolatileQualified()) { for( ; S; S = S->getParent()) { if (S->isDeclScope(Var)) { IsThrownVarInScope = true; break; } // FIXME: Many of the scope checks here seem incorrect. if (S->getFlags() & (Scope::FnScope | Scope::ClassScope | Scope::BlockScope | Scope::ObjCMethodScope | Scope::TryScope)) break; } } } } return BuildCXXThrow(OpLoc, Ex, IsThrownVarInScope); } ExprResult Sema::BuildCXXThrow(SourceLocation OpLoc, Expr *Ex, bool IsThrownVarInScope) { // Don't report an error if 'throw' is used in system headers. if (!getLangOpts().CXXExceptions && !getSourceManager().isInSystemHeader(OpLoc) && !getLangOpts().CUDA) { // Delay error emission for the OpenMP device code. 
targetDiag(OpLoc, diag::err_exceptions_disabled) << "throw"; } // Exceptions aren't allowed in CUDA device code. if (getLangOpts().CUDA) CUDADiagIfDeviceCode(OpLoc, diag::err_cuda_device_exceptions) << "throw" << CurrentCUDATarget(); if (getCurScope() && getCurScope()->isOpenMPSimdDirectiveScope()) Diag(OpLoc, diag::err_omp_simd_region_cannot_use_stmt) << "throw"; if (Ex && !Ex->isTypeDependent()) { // Initialize the exception result. This implicitly weeds out // abstract types or types with inaccessible copy constructors. // C++0x [class.copymove]p31: // When certain criteria are met, an implementation is allowed to omit the // copy/move construction of a class object [...] // // - in a throw-expression, when the operand is the name of a // non-volatile automatic object (other than a function or // catch-clause // parameter) whose scope does not extend beyond the end of the // innermost enclosing try-block (if there is one), the copy/move // operation from the operand to the exception object (15.1) can be // omitted by constructing the automatic object directly into the // exception object NamedReturnInfo NRInfo = IsThrownVarInScope ? getNamedReturnInfo(Ex) : NamedReturnInfo(); QualType ExceptionObjectTy = Context.getExceptionObjectType(Ex->getType()); if (CheckCXXThrowOperand(OpLoc, ExceptionObjectTy, Ex)) return ExprError(); InitializedEntity Entity = InitializedEntity::InitializeException(OpLoc, ExceptionObjectTy); ExprResult Res = PerformMoveOrCopyInitialization(Entity, NRInfo, Ex); if (Res.isInvalid()) return ExprError(); Ex = Res.get(); } // PPC MMA non-pointer types are not allowed as throw expr types. if (Ex && Context.getTargetInfo().getTriple().isPPC64()) CheckPPCMMAType(Ex->getType(), Ex->getBeginLoc()); return new (Context) CXXThrowExpr(Ex, Context.VoidTy, OpLoc, IsThrownVarInScope); } static void collectPublicBases(CXXRecordDecl *RD, llvm::DenseMap &SubobjectsSeen, llvm::SmallPtrSetImpl &VBases, llvm::SetVector &PublicSubobjectsSeen, bool ParentIsPublic) { for (const CXXBaseSpecifier &BS : RD->bases()) { CXXRecordDecl *BaseDecl = BS.getType()->getAsCXXRecordDecl(); bool NewSubobject; // Virtual bases constitute the same subobject. Non-virtual bases are // always distinct subobjects. if (BS.isVirtual()) NewSubobject = VBases.insert(BaseDecl).second; else NewSubobject = true; if (NewSubobject) ++SubobjectsSeen[BaseDecl]; // Only add subobjects which have public access throughout the entire chain. bool PublicPath = ParentIsPublic && BS.getAccessSpecifier() == AS_public; if (PublicPath) PublicSubobjectsSeen.insert(BaseDecl); // Recurse on to each base subobject. collectPublicBases(BaseDecl, SubobjectsSeen, VBases, PublicSubobjectsSeen, PublicPath); } } static void getUnambiguousPublicSubobjects( CXXRecordDecl *RD, llvm::SmallVectorImpl &Objects) { llvm::DenseMap SubobjectsSeen; llvm::SmallSet VBases; llvm::SetVector PublicSubobjectsSeen; SubobjectsSeen[RD] = 1; PublicSubobjectsSeen.insert(RD); collectPublicBases(RD, SubobjectsSeen, VBases, PublicSubobjectsSeen, /*ParentIsPublic=*/true); for (CXXRecordDecl *PublicSubobject : PublicSubobjectsSeen) { // Skip ambiguous objects. if (SubobjectsSeen[PublicSubobject] > 1) continue; Objects.push_back(PublicSubobject); } } /// CheckCXXThrowOperand - Validate the operand of a throw. bool Sema::CheckCXXThrowOperand(SourceLocation ThrowLoc, QualType ExceptionObjectTy, Expr *E) { // If the type of the exception would be an incomplete type or a pointer // to an incomplete type other than (cv) void the program is ill-formed. 
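  // For example (illustrative only):
  //
  //   struct Fwd;                          // declared but never defined
  //   void f(Fwd &R, Fwd *P, void *V) {
  //     throw R;   // error: incomplete type
  //     throw P;   // error: pointer to incomplete type
  //     throw V;   // OK: pointer to (cv) void
  //   }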
QualType Ty = ExceptionObjectTy; bool isPointer = false; if (const PointerType* Ptr = Ty->getAs()) { Ty = Ptr->getPointeeType(); isPointer = true; } // Cannot throw WebAssembly reference type. if (Ty.isWebAssemblyReferenceType()) { Diag(ThrowLoc, diag::err_wasm_reftype_tc) << 0 << E->getSourceRange(); return true; } // Cannot throw WebAssembly table. if (isPointer && Ty.isWebAssemblyReferenceType()) { Diag(ThrowLoc, diag::err_wasm_table_art) << 2 << E->getSourceRange(); return true; } if (!isPointer || !Ty->isVoidType()) { if (RequireCompleteType(ThrowLoc, Ty, isPointer ? diag::err_throw_incomplete_ptr : diag::err_throw_incomplete, E->getSourceRange())) return true; if (!isPointer && Ty->isSizelessType()) { Diag(ThrowLoc, diag::err_throw_sizeless) << Ty << E->getSourceRange(); return true; } if (RequireNonAbstractType(ThrowLoc, ExceptionObjectTy, diag::err_throw_abstract_type, E)) return true; } // If the exception has class type, we need additional handling. CXXRecordDecl *RD = Ty->getAsCXXRecordDecl(); if (!RD) return false; // If we are throwing a polymorphic class type or pointer thereof, // exception handling will make use of the vtable. MarkVTableUsed(ThrowLoc, RD); // If a pointer is thrown, the referenced object will not be destroyed. if (isPointer) return false; // If the class has a destructor, we must be able to call it. if (!RD->hasIrrelevantDestructor()) { if (CXXDestructorDecl *Destructor = LookupDestructor(RD)) { MarkFunctionReferenced(E->getExprLoc(), Destructor); CheckDestructorAccess(E->getExprLoc(), Destructor, PDiag(diag::err_access_dtor_exception) << Ty); if (DiagnoseUseOfDecl(Destructor, E->getExprLoc())) return true; } } // The MSVC ABI creates a list of all types which can catch the exception // object. This list also references the appropriate copy constructor to call // if the object is caught by value and has a non-trivial copy constructor. if (Context.getTargetInfo().getCXXABI().isMicrosoft()) { // We are only interested in the public, unambiguous bases contained within // the exception object. Bases which are ambiguous or otherwise // inaccessible are not catchable types. llvm::SmallVector UnambiguousPublicSubobjects; getUnambiguousPublicSubobjects(RD, UnambiguousPublicSubobjects); for (CXXRecordDecl *Subobject : UnambiguousPublicSubobjects) { // Attempt to lookup the copy constructor. Various pieces of machinery // will spring into action, like template instantiation, which means this // cannot be a simple walk of the class's decls. Instead, we must perform // lookup and overload resolution. CXXConstructorDecl *CD = LookupCopyingConstructor(Subobject, 0); if (!CD || CD->isDeleted()) continue; // Mark the constructor referenced as it is used by this throw expression. MarkFunctionReferenced(E->getExprLoc(), CD); // Skip this copy constructor if it is trivial, we don't need to record it // in the catchable type data. if (CD->isTrivial()) continue; // The copy constructor is non-trivial, create a mapping from this class // type to this constructor. // N.B. The selection of copy constructor is not sensitive to this // particular throw-site. Lookup will be performed at the catch-site to // ensure that the copy constructor is, in fact, accessible (via // friendship or any other means). Context.addCopyConstructorForExceptionObject(Subobject, CD); // We don't keep the instantiated default argument expressions around so // we must rebuild them here. 
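      // Illustrative sketch (names are hypothetical, not from the source): a
      // copy constructor with extra defaulted parameters is still the one
      // recorded for the catchable type, so its default arguments have to be
      // rebuilt and checked at the throw site:
      //
      //   struct E {
      //     E();
      //     E(const E &, int Flags = defaultFlags());  // hypothetical helper
      //   };
      //   void f() { throw E(); }  // MSVC ABI records E's copy ctor here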
      for (unsigned I = 1, E = CD->getNumParams(); I != E; ++I) {
        if (CheckCXXDefaultArgExpr(ThrowLoc, CD, CD->getParamDecl(I)))
          return true;
      }
    }
  }

  // Under the Itanium C++ ABI, memory for the exception object is allocated by
  // the runtime with no ability for the compiler to request additional
  // alignment. Warn if the exception type requires alignment beyond the
  // minimum guaranteed by the target C++ runtime.
  if (Context.getTargetInfo().getCXXABI().isItaniumFamily()) {
    CharUnits TypeAlign = Context.getTypeAlignInChars(Ty);
    CharUnits ExnObjAlign = Context.getExnObjectAlignment();
    if (ExnObjAlign < TypeAlign) {
      Diag(ThrowLoc, diag::warn_throw_underaligned_obj);
      Diag(ThrowLoc, diag::note_throw_underaligned_obj)
          << Ty << (unsigned)TypeAlign.getQuantity()
          << (unsigned)ExnObjAlign.getQuantity();
    }
  }

  return false;
}

static QualType adjustCVQualifiersForCXXThisWithinLambda(
    ArrayRef<FunctionScopeInfo *> FunctionScopes, QualType ThisTy,
    DeclContext *CurSemaContext, ASTContext &ASTCtx) {

  QualType ClassType = ThisTy->getPointeeType();
  LambdaScopeInfo *CurLSI = nullptr;
  DeclContext *CurDC = CurSemaContext;

  // Iterate through the stack of lambdas starting from the innermost lambda to
  // the outermost lambda, checking if '*this' is ever captured by copy - since
  // that could change the cv-qualifiers of the '*this' object.
  // The object referred to by '*this' starts out with the cv-qualifiers of its
  // member function. We then start with the innermost lambda and iterate
  // outward checking to see if any lambda performs a by-copy capture of
  // '*this' - and if so, any nested lambda must respect the 'constness' of
  // that capturing lambda's call operator.
  //
  // Since the FunctionScopeInfo stack is representative of the lexical
  // nesting of the lambda expressions during initial parsing (and is the best
  // place for querying information about captures of lambdas that are
  // partially processed), and perhaps during instantiation of function
  // templates that contain lambda expressions that need to be transformed, BUT
  // not necessarily during instantiation of a nested generic lambda's function
  // call operator (which might even be instantiated at the end of the TU) - at
  // which time the DeclContext tree is mature enough to query capture
  // information reliably - we use a two-pronged approach to walk through all
  // the lexically enclosing lambda expressions:
  //
  //  1) Climb down the FunctionScopeInfo stack as long as each item represents
  //  a lambda (i.e. LambdaScopeInfo) AND each LSI's 'closure-type' is lexically
  //  enclosed by the call-operator of the LSI below it on the stack (while
  //  tracking the enclosing DC for step 2 if needed). Note the topmost LSI on
  //  the stack represents the innermost lambda.
  //
  //  2) If we run out of enclosing LSI's, check if the enclosing DeclContext
  //  represents a lambda's call operator. If it does, we must be instantiating
  //  a generic lambda's call operator (represented by the current LSI, and
  //  this should be the only scenario where an inconsistency between the LSI
  //  and the DeclContext should occur), so climb out the DeclContexts if they
  //  represent lambdas, while querying the corresponding closure types
  //  regarding capture information.

  // 1) Climb down the function scope info stack.
  for (int I = FunctionScopes.size();
       I-- && isa<LambdaScopeInfo>(FunctionScopes[I]) &&
       (!CurLSI || !CurLSI->Lambda ||
        CurLSI->Lambda->getDeclContext() ==
            cast<LambdaScopeInfo>(FunctionScopes[I])->CallOperator);
       CurDC = getLambdaAwareParentOfDeclContext(CurDC)) {
    CurLSI = cast<LambdaScopeInfo>(FunctionScopes[I]);

    if (!CurLSI->isCXXThisCaptured())
      continue;

    auto C = CurLSI->getCXXThisCapture();

    if (C.isCopyCapture()) {
      if (!CurLSI->Mutable)
        ClassType.addConst();
      return ASTCtx.getPointerType(ClassType);
    }
  }

  // 2) We've run out of ScopeInfos, so check (1) whether CurDC is a lambda
  // (which can happen during instantiation of its nested generic lambda call
  // operator), and (2) whether we're in a lambda scope (lambda body).
  if (CurLSI && isLambdaCallOperator(CurDC)) {
    assert(isGenericLambdaCallOperatorSpecialization(CurLSI->CallOperator) &&
           "While computing 'this' capture-type for a generic lambda, when we "
           "run out of enclosing LSI's, yet the enclosing DC is a "
           "lambda-call-operator we must be (i.e. Current LSI) in a generic "
           "lambda call operator");
    assert(CurDC == getLambdaAwareParentOfDeclContext(CurLSI->CallOperator));

    auto IsThisCaptured =
        [](CXXRecordDecl *Closure, bool &IsByCopy, bool &IsConst) {
      IsConst = false;
      IsByCopy = false;
      for (auto &&C : Closure->captures()) {
        if (C.capturesThis()) {
          if (C.getCaptureKind() == LCK_StarThis)
            IsByCopy = true;
          if (Closure->getLambdaCallOperator()->isConst())
            IsConst = true;
          return true;
        }
      }
      return false;
    };

    bool IsByCopyCapture = false;
    bool IsConstCapture = false;
    CXXRecordDecl *Closure = cast<CXXRecordDecl>(CurDC->getParent());
    while (Closure &&
           IsThisCaptured(Closure, IsByCopyCapture, IsConstCapture)) {
      if (IsByCopyCapture) {
        if (IsConstCapture)
          ClassType.addConst();
        return ASTCtx.getPointerType(ClassType);
      }
      Closure = isLambdaCallOperator(Closure->getParent())
                    ? cast<CXXRecordDecl>(Closure->getParent()->getParent())
                    : nullptr;
    }
  }
  return ASTCtx.getPointerType(ClassType);
}

QualType Sema::getCurrentThisType() {
  DeclContext *DC = getFunctionLevelDeclContext();
  QualType ThisTy = CXXThisTypeOverride;

  if (CXXMethodDecl *method = dyn_cast<CXXMethodDecl>(DC)) {
    if (method && method->isInstance())
      ThisTy = method->getThisType();
  }

  if (ThisTy.isNull() && isLambdaCallOperator(CurContext) &&
      inTemplateInstantiation() && isa<CXXRecordDecl>(DC)) {

    // This is a lambda call operator that is being instantiated as a default
    // initializer. DC must point to the enclosing class type, so we can
    // recover the 'this' type from it.
    QualType ClassTy = Context.getTypeDeclType(cast<CXXRecordDecl>(DC));
    // There are no cv-qualifiers for 'this' within default initializers,
    // per [expr.prim.general]p4.
    ThisTy = Context.getPointerType(ClassTy);
  }

  // If we are within a lambda's call operator, the cv-qualifiers of 'this'
  // might need to be adjusted if the lambda or any of its enclosing lambdas
  // captures '*this' by copy.
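  // For example (illustrative only):
  //
  //   struct W {
  //     int N = 0;
  //     void f() {
  //       [*this] { return N; }();             // 'this' here is 'const W *'
  //       [*this]() mutable { return ++N; }(); // 'this' here is 'W *'
  //     }
  //   };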
if (!ThisTy.isNull() && isLambdaCallOperator(CurContext)) return adjustCVQualifiersForCXXThisWithinLambda(FunctionScopes, ThisTy, CurContext, Context); return ThisTy; } Sema::CXXThisScopeRAII::CXXThisScopeRAII(Sema &S, Decl *ContextDecl, Qualifiers CXXThisTypeQuals, bool Enabled) : S(S), OldCXXThisTypeOverride(S.CXXThisTypeOverride), Enabled(false) { if (!Enabled || !ContextDecl) return; CXXRecordDecl *Record = nullptr; if (ClassTemplateDecl *Template = dyn_cast(ContextDecl)) Record = Template->getTemplatedDecl(); else Record = cast(ContextDecl); QualType T = S.Context.getRecordType(Record); T = S.getASTContext().getQualifiedType(T, CXXThisTypeQuals); S.CXXThisTypeOverride = S.Context.getPointerType(T); this->Enabled = true; } Sema::CXXThisScopeRAII::~CXXThisScopeRAII() { if (Enabled) { S.CXXThisTypeOverride = OldCXXThisTypeOverride; } } static void buildLambdaThisCaptureFixit(Sema &Sema, LambdaScopeInfo *LSI) { SourceLocation DiagLoc = LSI->IntroducerRange.getEnd(); assert(!LSI->isCXXThisCaptured()); // [=, this] {}; // until C++20: Error: this when = is the default if (LSI->ImpCaptureStyle == CapturingScopeInfo::ImpCap_LambdaByval && !Sema.getLangOpts().CPlusPlus20) return; Sema.Diag(DiagLoc, diag::note_lambda_this_capture_fixit) << FixItHint::CreateInsertion( DiagLoc, LSI->NumExplicitCaptures > 0 ? ", this" : "this"); } bool Sema::CheckCXXThisCapture(SourceLocation Loc, const bool Explicit, bool BuildAndDiagnose, const unsigned *const FunctionScopeIndexToStopAt, const bool ByCopy) { // We don't need to capture this in an unevaluated context. if (isUnevaluatedContext() && !Explicit) return true; assert((!ByCopy || Explicit) && "cannot implicitly capture *this by value"); const int MaxFunctionScopesIndex = FunctionScopeIndexToStopAt ? *FunctionScopeIndexToStopAt : FunctionScopes.size() - 1; // Check that we can capture the *enclosing object* (referred to by '*this') // by the capturing-entity/closure (lambda/block/etc) at // MaxFunctionScopesIndex-deep on the FunctionScopes stack. // Note: The *enclosing object* can only be captured by-value by a // closure that is a lambda, using the explicit notation: // [*this] { ... }. // Every other capture of the *enclosing object* results in its by-reference // capture. // For a closure 'L' (at MaxFunctionScopesIndex in the FunctionScopes // stack), we can capture the *enclosing object* only if: // - 'L' has an explicit byref or byval capture of the *enclosing object* // - or, 'L' has an implicit capture. // AND // -- there is no enclosing closure // -- or, there is some enclosing closure 'E' that has already captured the // *enclosing object*, and every intervening closure (if any) between 'E' // and 'L' can implicitly capture the *enclosing object*. // -- or, every enclosing closure can implicitly capture the // *enclosing object* unsigned NumCapturingClosures = 0; for (int idx = MaxFunctionScopesIndex; idx >= 0; idx--) { if (CapturingScopeInfo *CSI = dyn_cast(FunctionScopes[idx])) { if (CSI->CXXThisCaptureIndex != 0) { // 'this' is already being captured; there isn't anything more to do. CSI->Captures[CSI->CXXThisCaptureIndex - 1].markUsed(BuildAndDiagnose); break; } LambdaScopeInfo *LSI = dyn_cast(CSI); if (LSI && isGenericLambdaCallOperatorSpecialization(LSI->CallOperator)) { // This context can't implicitly capture 'this'; fail out. 
if (BuildAndDiagnose) { Diag(Loc, diag::err_this_capture) << (Explicit && idx == MaxFunctionScopesIndex); if (!Explicit) buildLambdaThisCaptureFixit(*this, LSI); } return true; } if (CSI->ImpCaptureStyle == CapturingScopeInfo::ImpCap_LambdaByref || CSI->ImpCaptureStyle == CapturingScopeInfo::ImpCap_LambdaByval || CSI->ImpCaptureStyle == CapturingScopeInfo::ImpCap_Block || CSI->ImpCaptureStyle == CapturingScopeInfo::ImpCap_CapturedRegion || (Explicit && idx == MaxFunctionScopesIndex)) { // Regarding (Explicit && idx == MaxFunctionScopesIndex): only the first // iteration through can be an explicit capture, all enclosing closures, // if any, must perform implicit captures. // This closure can capture 'this'; continue looking upwards. NumCapturingClosures++; continue; } // This context can't implicitly capture 'this'; fail out. if (BuildAndDiagnose) Diag(Loc, diag::err_this_capture) << (Explicit && idx == MaxFunctionScopesIndex); if (!Explicit) buildLambdaThisCaptureFixit(*this, LSI); return true; } break; } if (!BuildAndDiagnose) return false; // If we got here, then the closure at MaxFunctionScopesIndex on the // FunctionScopes stack, can capture the *enclosing object*, so capture it // (including implicit by-reference captures in any enclosing closures). // In the loop below, respect the ByCopy flag only for the closure requesting // the capture (i.e. first iteration through the loop below). Ignore it for // all enclosing closure's up to NumCapturingClosures (since they must be // implicitly capturing the *enclosing object* by reference (see loop // above)). assert((!ByCopy || isa(FunctionScopes[MaxFunctionScopesIndex])) && "Only a lambda can capture the enclosing object (referred to by " "*this) by copy"); QualType ThisTy = getCurrentThisType(); for (int idx = MaxFunctionScopesIndex; NumCapturingClosures; --idx, --NumCapturingClosures) { CapturingScopeInfo *CSI = cast(FunctionScopes[idx]); // The type of the corresponding data member (not a 'this' pointer if 'by // copy'). QualType CaptureType = ByCopy ? ThisTy->getPointeeType() : ThisTy; bool isNested = NumCapturingClosures > 1; CSI->addThisCapture(isNested, Loc, CaptureType, ByCopy); } return false; } ExprResult Sema::ActOnCXXThis(SourceLocation Loc) { /// C++ 9.3.2: In the body of a non-static member function, the keyword this /// is a non-lvalue expression whose value is the address of the object for /// which the function is called. QualType ThisTy = getCurrentThisType(); if (ThisTy.isNull()) return Diag(Loc, diag::err_invalid_this_use); return BuildCXXThisExpr(Loc, ThisTy, /*IsImplicit=*/false); } Expr *Sema::BuildCXXThisExpr(SourceLocation Loc, QualType Type, bool IsImplicit) { if (getLangOpts().HLSL && Type.getTypePtr()->isPointerType()) { auto *This = new (Context) CXXThisExpr(Loc, Type.getTypePtr()->getPointeeType(), IsImplicit); This->setValueKind(ExprValueKind::VK_LValue); MarkThisReferenced(This); return This; } auto *This = new (Context) CXXThisExpr(Loc, Type, IsImplicit); MarkThisReferenced(This); return This; } void Sema::MarkThisReferenced(CXXThisExpr *This) { CheckCXXThisCapture(This->getExprLoc()); } bool Sema::isThisOutsideMemberFunctionBody(QualType BaseType) { // If we're outside the body of a member function, then we'll have a specified // type for 'this'. if (CXXThisTypeOverride.isNull()) return false; // Determine whether we're looking into a class that's currently being // defined. 
  CXXRecordDecl *Class = BaseType->getAsCXXRecordDecl();
  return Class && Class->isBeingDefined();
}

/// Parse construction of a specified type.
/// Can be interpreted either as function-style casting ("int(x)")
/// or class type construction ("ClassType(x,y,z)")
/// or creation of a value-initialized type ("int()").
ExprResult Sema::ActOnCXXTypeConstructExpr(ParsedType TypeRep,
                                           SourceLocation LParenOrBraceLoc,
                                           MultiExprArg exprs,
                                           SourceLocation RParenOrBraceLoc,
                                           bool ListInitialization) {
  if (!TypeRep)
    return ExprError();

  TypeSourceInfo *TInfo;
  QualType Ty = GetTypeFromParser(TypeRep, &TInfo);
  if (!TInfo)
    TInfo = Context.getTrivialTypeSourceInfo(Ty, SourceLocation());

  auto Result = BuildCXXTypeConstructExpr(TInfo, LParenOrBraceLoc, exprs,
                                          RParenOrBraceLoc,
                                          ListInitialization);
  // Avoid creating a non-type-dependent expression that contains typos.
  // Non-type-dependent expressions are liable to be discarded without
  // checking for embedded typos.
  if (!Result.isInvalid() && Result.get()->isInstantiationDependent() &&
      !Result.get()->isTypeDependent())
    Result = CorrectDelayedTyposInExpr(Result.get());
  else if (Result.isInvalid())
    Result = CreateRecoveryExpr(TInfo->getTypeLoc().getBeginLoc(),
                                RParenOrBraceLoc, exprs, Ty);
  return Result;
}

ExprResult
Sema::BuildCXXTypeConstructExpr(TypeSourceInfo *TInfo,
                                SourceLocation LParenOrBraceLoc,
                                MultiExprArg Exprs,
                                SourceLocation RParenOrBraceLoc,
                                bool ListInitialization) {
  QualType Ty = TInfo->getType();
  SourceLocation TyBeginLoc = TInfo->getTypeLoc().getBeginLoc();

  assert((!ListInitialization || Exprs.size() == 1) &&
         "List initialization must have exactly one expression.");
  SourceRange FullRange = SourceRange(TyBeginLoc, RParenOrBraceLoc);

  InitializedEntity Entity =
      InitializedEntity::InitializeTemporary(Context, TInfo);
  InitializationKind Kind =
      Exprs.size()
          ? ListInitialization
                ? InitializationKind::CreateDirectList(
                      TyBeginLoc, LParenOrBraceLoc, RParenOrBraceLoc)
                : InitializationKind::CreateDirect(TyBeginLoc, LParenOrBraceLoc,
                                                   RParenOrBraceLoc)
          : InitializationKind::CreateValue(TyBeginLoc, LParenOrBraceLoc,
                                            RParenOrBraceLoc);

  // C++17 [expr.type.conv]p1:
  //   If the type is a placeholder for a deduced class type, [...perform class
  //   template argument deduction...]
  // C++23:
  //   Otherwise, if the type contains a placeholder type, it is replaced by
  //   the type determined by placeholder type deduction.
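  // For example (illustrative only):
  //
  //   std::pair(1, 2.5);   // CTAD: deduces std::pair<int, double>
  //   auto(42);            // C++23 decay-copy: placeholder deduced to int
  //   auto{3.14};          // likewise, deduced to double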
DeducedType *Deduced = Ty->getContainedDeducedType(); if (Deduced && !Deduced->isDeduced() && isa(Deduced)) { Ty = DeduceTemplateSpecializationFromInitializer(TInfo, Entity, Kind, Exprs); if (Ty.isNull()) return ExprError(); Entity = InitializedEntity::InitializeTemporary(TInfo, Ty); } else if (Deduced && !Deduced->isDeduced()) { MultiExprArg Inits = Exprs; if (ListInitialization) { auto *ILE = cast(Exprs[0]); Inits = MultiExprArg(ILE->getInits(), ILE->getNumInits()); } if (Inits.empty()) return ExprError(Diag(TyBeginLoc, diag::err_auto_expr_init_no_expression) << Ty << FullRange); if (Inits.size() > 1) { Expr *FirstBad = Inits[1]; return ExprError(Diag(FirstBad->getBeginLoc(), diag::err_auto_expr_init_multiple_expressions) << Ty << FullRange); } if (getLangOpts().CPlusPlus23) { if (Ty->getAs()) Diag(TyBeginLoc, diag::warn_cxx20_compat_auto_expr) << FullRange; } Expr *Deduce = Inits[0]; if (isa(Deduce)) return ExprError( Diag(Deduce->getBeginLoc(), diag::err_auto_expr_init_paren_braces) << ListInitialization << Ty << FullRange); QualType DeducedType; TemplateDeductionInfo Info(Deduce->getExprLoc()); TemplateDeductionResult Result = DeduceAutoType(TInfo->getTypeLoc(), Deduce, DeducedType, Info); if (Result != TDK_Success && Result != TDK_AlreadyDiagnosed) return ExprError(Diag(TyBeginLoc, diag::err_auto_expr_deduction_failure) << Ty << Deduce->getType() << FullRange << Deduce->getSourceRange()); if (DeducedType.isNull()) { assert(Result == TDK_AlreadyDiagnosed); return ExprError(); } Ty = DeducedType; Entity = InitializedEntity::InitializeTemporary(TInfo, Ty); } if (Ty->isDependentType() || CallExpr::hasAnyTypeDependentArguments(Exprs)) return CXXUnresolvedConstructExpr::Create( Context, Ty.getNonReferenceType(), TInfo, LParenOrBraceLoc, Exprs, RParenOrBraceLoc, ListInitialization); // C++ [expr.type.conv]p1: // If the expression list is a parenthesized single expression, the type // conversion expression is equivalent (in definedness, and if defined in // meaning) to the corresponding cast expression. if (Exprs.size() == 1 && !ListInitialization && !isa(Exprs[0])) { Expr *Arg = Exprs[0]; return BuildCXXFunctionalCastExpr(TInfo, Ty, LParenOrBraceLoc, Arg, RParenOrBraceLoc); } // For an expression of the form T(), T shall not be an array type. QualType ElemTy = Ty; if (Ty->isArrayType()) { if (!ListInitialization) return ExprError(Diag(TyBeginLoc, diag::err_value_init_for_array_type) << FullRange); ElemTy = Context.getBaseElementType(Ty); } // Only construct objects with object types. // The standard doesn't explicitly forbid function types here, but that's an // obvious oversight, as there's no way to dynamically construct a function // in general. if (Ty->isFunctionType()) return ExprError(Diag(TyBeginLoc, diag::err_init_for_function_type) << Ty << FullRange); // C++17 [expr.type.conv]p2: // If the type is cv void and the initializer is (), the expression is a // prvalue of the specified type that performs no initialization. if (!Ty->isVoidType() && RequireCompleteType(TyBeginLoc, ElemTy, diag::err_invalid_incomplete_type_use, FullRange)) return ExprError(); // Otherwise, the expression is a prvalue of the specified type whose // result object is direct-initialized (11.6) with the initializer. 
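  // For example (illustrative only):
  //
  //   int();         // value-initialized prvalue, equal to 0
  //   void();        // prvalue of type void, no initialization performed
  //   int(3.5);      // single parenthesized argument: handled above as the
  //                  // functional cast equivalent of (int)3.5
  //   using IA = int[3];
  //   IA();          // error: T() with array type
  //   IA{1, 2, 3};   // OK: list-initialized array temporary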
  InitializationSequence InitSeq(*this, Entity, Kind, Exprs);
  ExprResult Result = InitSeq.Perform(*this, Entity, Kind, Exprs);

  if (Result.isInvalid())
    return Result;

  Expr *Inner = Result.get();
  if (CXXBindTemporaryExpr *BTE = dyn_cast_or_null<CXXBindTemporaryExpr>(Inner))
    Inner = BTE->getSubExpr();
  if (auto *CE = dyn_cast<ConstantExpr>(Inner);
      CE && CE->isImmediateInvocation())
    Inner = CE->getSubExpr();
  if (!isa<CXXTemporaryObjectExpr>(Inner) &&
      !isa<CXXUnresolvedConstructExpr>(Inner)) {
    // If we created a CXXTemporaryObjectExpr, that node also represents the
    // functional cast. Otherwise, create an explicit cast to represent
    // the syntactic form of a functional-style cast that was used here.
    //
    // FIXME: Creating a CXXFunctionalCastExpr around a CXXConstructExpr
    // would give a more consistent AST representation than using a
    // CXXTemporaryObjectExpr. It's also weird that the functional cast
    // is sometimes handled by initialization and sometimes not.
    QualType ResultType = Result.get()->getType();
    SourceRange Locs = ListInitialization
                           ? SourceRange()
                           : SourceRange(LParenOrBraceLoc, RParenOrBraceLoc);
    Result = CXXFunctionalCastExpr::Create(
        Context, ResultType, Expr::getValueKindForType(Ty), TInfo, CK_NoOp,
        Result.get(), /*Path=*/nullptr, CurFPFeatureOverrides(),
        Locs.getBegin(), Locs.getEnd());
  }

  return Result;
}

bool Sema::isUsualDeallocationFunction(const CXXMethodDecl *Method) {
  // [CUDA] Ignore this function, if we can't call it.
  const FunctionDecl *Caller = getCurFunctionDecl(/*AllowLambda=*/true);
  if (getLangOpts().CUDA) {
    auto CallPreference = IdentifyCUDAPreference(Caller, Method);
    // If it's not callable at all, it's not the right function.
    if (CallPreference < CFP_WrongSide)
      return false;
    if (CallPreference == CFP_WrongSide) {
      // Maybe. We have to check if there are better alternatives.
      DeclContext::lookup_result R =
          Method->getDeclContext()->lookup(Method->getDeclName());
      for (const auto *D : R) {
        if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
          if (IdentifyCUDAPreference(Caller, FD) > CFP_WrongSide)
            return false;
        }
      }
      // We've found no better variants.
    }
  }

  SmallVector<const FunctionDecl *, 4> PreventedBy;
  bool Result = Method->isUsualDeallocationFunction(PreventedBy);

  if (Result || !getLangOpts().CUDA || PreventedBy.empty())
    return Result;

  // In case of CUDA, return true if none of the 1-argument deallocator
  // functions are actually callable.
  return llvm::none_of(PreventedBy, [&](const FunctionDecl *FD) {
    assert(FD->getNumParams() == 1 &&
           "Only single-operand functions should be in PreventedBy");
    return IdentifyCUDAPreference(Caller, FD) >= CFP_HostDevice;
  });
}

/// Determine whether the given function is a non-placement
/// deallocation function.
static bool isNonPlacementDeallocationFunction(Sema &S, FunctionDecl *FD) { if (CXXMethodDecl *Method = dyn_cast(FD)) return S.isUsualDeallocationFunction(Method); if (FD->getOverloadedOperator() != OO_Delete && FD->getOverloadedOperator() != OO_Array_Delete) return false; unsigned UsualParams = 1; if (S.getLangOpts().SizedDeallocation && UsualParams < FD->getNumParams() && S.Context.hasSameUnqualifiedType( FD->getParamDecl(UsualParams)->getType(), S.Context.getSizeType())) ++UsualParams; if (S.getLangOpts().AlignedAllocation && UsualParams < FD->getNumParams() && S.Context.hasSameUnqualifiedType( FD->getParamDecl(UsualParams)->getType(), S.Context.getTypeDeclType(S.getStdAlignValT()))) ++UsualParams; return UsualParams == FD->getNumParams(); } namespace { struct UsualDeallocFnInfo { UsualDeallocFnInfo() : Found(), FD(nullptr) {} UsualDeallocFnInfo(Sema &S, DeclAccessPair Found) : Found(Found), FD(dyn_cast(Found->getUnderlyingDecl())), Destroying(false), HasSizeT(false), HasAlignValT(false), CUDAPref(Sema::CFP_Native) { // A function template declaration is never a usual deallocation function. if (!FD) return; unsigned NumBaseParams = 1; if (FD->isDestroyingOperatorDelete()) { Destroying = true; ++NumBaseParams; } if (NumBaseParams < FD->getNumParams() && S.Context.hasSameUnqualifiedType( FD->getParamDecl(NumBaseParams)->getType(), S.Context.getSizeType())) { ++NumBaseParams; HasSizeT = true; } if (NumBaseParams < FD->getNumParams() && FD->getParamDecl(NumBaseParams)->getType()->isAlignValT()) { ++NumBaseParams; HasAlignValT = true; } // In CUDA, determine how much we'd like / dislike to call this. if (S.getLangOpts().CUDA) if (auto *Caller = S.getCurFunctionDecl(/*AllowLambda=*/true)) CUDAPref = S.IdentifyCUDAPreference(Caller, FD); } explicit operator bool() const { return FD; } bool isBetterThan(const UsualDeallocFnInfo &Other, bool WantSize, bool WantAlign) const { // C++ P0722: // A destroying operator delete is preferred over a non-destroying // operator delete. if (Destroying != Other.Destroying) return Destroying; // C++17 [expr.delete]p10: // If the type has new-extended alignment, a function with a parameter // of type std::align_val_t is preferred; otherwise a function without // such a parameter is preferred if (HasAlignValT != Other.HasAlignValT) return HasAlignValT == WantAlign; if (HasSizeT != Other.HasSizeT) return HasSizeT == WantSize; // Use CUDA call preference as a tiebreaker. return CUDAPref > Other.CUDAPref; } DeclAccessPair Found; FunctionDecl *FD; bool Destroying, HasSizeT, HasAlignValT; Sema::CUDAFunctionPreference CUDAPref; }; } /// Determine whether a type has new-extended alignment. This may be called when /// the type is incomplete (for a delete-expression with an incomplete pointee /// type), in which case it will conservatively return false if the alignment is /// not known. static bool hasNewExtendedAlignment(Sema &S, QualType AllocType) { return S.getLangOpts().AlignedAllocation && S.getASTContext().getTypeAlignIfKnown(AllocType) > S.getASTContext().getTargetInfo().getNewAlign(); } /// Select the correct "usual" deallocation function to use from a selection of /// deallocation functions (either global or class-scope). 
static UsualDeallocFnInfo resolveDeallocationOverload( Sema &S, LookupResult &R, bool WantSize, bool WantAlign, llvm::SmallVectorImpl *BestFns = nullptr) { UsualDeallocFnInfo Best; for (auto I = R.begin(), E = R.end(); I != E; ++I) { UsualDeallocFnInfo Info(S, I.getPair()); if (!Info || !isNonPlacementDeallocationFunction(S, Info.FD) || Info.CUDAPref == Sema::CFP_Never) continue; if (!Best) { Best = Info; if (BestFns) BestFns->push_back(Info); continue; } if (Best.isBetterThan(Info, WantSize, WantAlign)) continue; // If more than one preferred function is found, all non-preferred // functions are eliminated from further consideration. if (BestFns && Info.isBetterThan(Best, WantSize, WantAlign)) BestFns->clear(); Best = Info; if (BestFns) BestFns->push_back(Info); } return Best; } /// Determine whether a given type is a class for which 'delete[]' would call /// a member 'operator delete[]' with a 'size_t' parameter. This implies that /// we need to store the array size (even if the type is /// trivially-destructible). static bool doesUsualArrayDeleteWantSize(Sema &S, SourceLocation loc, QualType allocType) { const RecordType *record = allocType->getBaseElementTypeUnsafe()->getAs(); if (!record) return false; // Try to find an operator delete[] in class scope. DeclarationName deleteName = S.Context.DeclarationNames.getCXXOperatorName(OO_Array_Delete); LookupResult ops(S, deleteName, loc, Sema::LookupOrdinaryName); S.LookupQualifiedName(ops, record->getDecl()); // We're just doing this for information. ops.suppressDiagnostics(); // Very likely: there's no operator delete[]. if (ops.empty()) return false; // If it's ambiguous, it should be illegal to call operator delete[] // on this thing, so it doesn't matter if we allocate extra space or not. if (ops.isAmbiguous()) return false; // C++17 [expr.delete]p10: // If the deallocation functions have class scope, the one without a // parameter of type std::size_t is selected. auto Best = resolveDeallocationOverload( S, ops, /*WantSize*/false, /*WantAlign*/hasNewExtendedAlignment(S, allocType)); return Best && Best.HasSizeT; } /// Parsed a C++ 'new' expression (C++ 5.3.4). /// /// E.g.: /// @code new (memory) int[size][4] @endcode /// or /// @code ::new Foo(23, "hello") @endcode /// /// \param StartLoc The first location of the expression. /// \param UseGlobal True if 'new' was prefixed with '::'. /// \param PlacementLParen Opening paren of the placement arguments. /// \param PlacementArgs Placement new arguments. /// \param PlacementRParen Closing paren of the placement arguments. /// \param TypeIdParens If the type is in parens, the source range. /// \param D The type to be allocated, as well as array dimensions. /// \param Initializer The initializing expression or initializer-list, or null /// if there is none. ExprResult Sema::ActOnCXXNew(SourceLocation StartLoc, bool UseGlobal, SourceLocation PlacementLParen, MultiExprArg PlacementArgs, SourceLocation PlacementRParen, SourceRange TypeIdParens, Declarator &D, Expr *Initializer) { std::optional ArraySize; // If the specified type is an array, unwrap it and save the expression. 
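  // For example (illustrative only): in 'new int[n][4]' the outermost,
  // possibly runtime bound 'n' is split off as the array size expression and
  // the allocated type becomes 'int[4]'; the remaining bounds must be constant
  // expressions:
  //
  //   new int[n][4];   // OK: allocates n objects of type int[4]
  //   new int[4][n];   // error: only the first bound may be non-constant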
if (D.getNumTypeObjects() > 0 && D.getTypeObject(0).Kind == DeclaratorChunk::Array) { DeclaratorChunk &Chunk = D.getTypeObject(0); if (D.getDeclSpec().hasAutoTypeSpec()) return ExprError(Diag(Chunk.Loc, diag::err_new_array_of_auto) << D.getSourceRange()); if (Chunk.Arr.hasStatic) return ExprError(Diag(Chunk.Loc, diag::err_static_illegal_in_new) << D.getSourceRange()); if (!Chunk.Arr.NumElts && !Initializer) return ExprError(Diag(Chunk.Loc, diag::err_array_new_needs_size) << D.getSourceRange()); ArraySize = static_cast(Chunk.Arr.NumElts); D.DropFirstTypeObject(); } // Every dimension shall be of constant size. if (ArraySize) { for (unsigned I = 0, N = D.getNumTypeObjects(); I < N; ++I) { if (D.getTypeObject(I).Kind != DeclaratorChunk::Array) break; DeclaratorChunk::ArrayTypeInfo &Array = D.getTypeObject(I).Arr; if (Expr *NumElts = (Expr *)Array.NumElts) { if (!NumElts->isTypeDependent() && !NumElts->isValueDependent()) { // FIXME: GCC permits constant folding here. We should either do so consistently // or not do so at all, rather than changing behavior in C++14 onwards. if (getLangOpts().CPlusPlus14) { // C++1y [expr.new]p6: Every constant-expression in a noptr-new-declarator // shall be a converted constant expression (5.19) of type std::size_t // and shall evaluate to a strictly positive value. llvm::APSInt Value(Context.getIntWidth(Context.getSizeType())); Array.NumElts = CheckConvertedConstantExpression(NumElts, Context.getSizeType(), Value, CCEK_ArrayBound) .get(); } else { Array.NumElts = VerifyIntegerConstantExpression( NumElts, nullptr, diag::err_new_array_nonconst, AllowFold) .get(); } if (!Array.NumElts) return ExprError(); } } } } TypeSourceInfo *TInfo = GetTypeForDeclarator(D, /*Scope=*/nullptr); QualType AllocType = TInfo->getType(); if (D.isInvalidType()) return ExprError(); SourceRange DirectInitRange; if (ParenListExpr *List = dyn_cast_or_null(Initializer)) DirectInitRange = List->getSourceRange(); return BuildCXXNew(SourceRange(StartLoc, D.getEndLoc()), UseGlobal, PlacementLParen, PlacementArgs, PlacementRParen, TypeIdParens, AllocType, TInfo, ArraySize, DirectInitRange, Initializer); } static bool isLegalArrayNewInitializer(CXXNewExpr::InitializationStyle Style, Expr *Init) { if (!Init) return true; if (ParenListExpr *PLE = dyn_cast(Init)) return PLE->getNumExprs() == 0; if (isa(Init)) return true; else if (CXXConstructExpr *CCE = dyn_cast(Init)) return !CCE->isListInitialization() && CCE->getConstructor()->isDefaultConstructor(); else if (Style == CXXNewExpr::ListInit) { assert(isa(Init) && "Shouldn't create list CXXConstructExprs for arrays."); return true; } return false; } bool Sema::isUnavailableAlignedAllocationFunction(const FunctionDecl &FD) const { if (!getLangOpts().AlignedAllocationUnavailable) return false; if (FD.isDefined()) return false; std::optional AlignmentParam; if (FD.isReplaceableGlobalAllocationFunction(&AlignmentParam) && AlignmentParam) return true; return false; } // Emit a diagnostic if an aligned allocation/deallocation function that is not // implemented in the standard library is selected. 
void Sema::diagnoseUnavailableAlignedAllocation(const FunctionDecl &FD, SourceLocation Loc) { if (isUnavailableAlignedAllocationFunction(FD)) { const llvm::Triple &T = getASTContext().getTargetInfo().getTriple(); StringRef OSName = AvailabilityAttr::getPlatformNameSourceSpelling( getASTContext().getTargetInfo().getPlatformName()); VersionTuple OSVersion = alignedAllocMinVersion(T.getOS()); OverloadedOperatorKind Kind = FD.getDeclName().getCXXOverloadedOperator(); bool IsDelete = Kind == OO_Delete || Kind == OO_Array_Delete; Diag(Loc, diag::err_aligned_allocation_unavailable) << IsDelete << FD.getType().getAsString() << OSName << OSVersion.getAsString() << OSVersion.empty(); Diag(Loc, diag::note_silence_aligned_allocation_unavailable); } } ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, SourceLocation PlacementLParen, MultiExprArg PlacementArgs, SourceLocation PlacementRParen, SourceRange TypeIdParens, QualType AllocType, TypeSourceInfo *AllocTypeInfo, std::optional ArraySize, SourceRange DirectInitRange, Expr *Initializer) { SourceRange TypeRange = AllocTypeInfo->getTypeLoc().getSourceRange(); SourceLocation StartLoc = Range.getBegin(); CXXNewExpr::InitializationStyle initStyle; if (DirectInitRange.isValid()) { assert(Initializer && "Have parens but no initializer."); initStyle = CXXNewExpr::CallInit; } else if (Initializer && isa(Initializer)) initStyle = CXXNewExpr::ListInit; else { assert((!Initializer || isa(Initializer) || isa(Initializer)) && "Initializer expression that cannot have been implicitly created."); initStyle = CXXNewExpr::NoInit; } MultiExprArg Exprs(&Initializer, Initializer ? 1 : 0); if (ParenListExpr *List = dyn_cast_or_null(Initializer)) { assert(initStyle == CXXNewExpr::CallInit && "paren init for non-call init"); Exprs = MultiExprArg(List->getExprs(), List->getNumExprs()); } // C++11 [expr.new]p15: // A new-expression that creates an object of type T initializes that // object as follows: InitializationKind Kind // - If the new-initializer is omitted, the object is default- // initialized (8.5); if no initialization is performed, // the object has indeterminate value = initStyle == CXXNewExpr::NoInit ? InitializationKind::CreateDefault(TypeRange.getBegin()) // - Otherwise, the new-initializer is interpreted according to // the // initialization rules of 8.5 for direct-initialization. : initStyle == CXXNewExpr::ListInit ? InitializationKind::CreateDirectList( TypeRange.getBegin(), Initializer->getBeginLoc(), Initializer->getEndLoc()) : InitializationKind::CreateDirect(TypeRange.getBegin(), DirectInitRange.getBegin(), DirectInitRange.getEnd()); // C++11 [dcl.spec.auto]p6. Deduce the type which 'auto' stands in for. auto *Deduced = AllocType->getContainedDeducedType(); if (Deduced && !Deduced->isDeduced() && isa(Deduced)) { if (ArraySize) return ExprError( Diag(*ArraySize ? (*ArraySize)->getExprLoc() : TypeRange.getBegin(), diag::err_deduced_class_template_compound_type) << /*array*/ 2 << (*ArraySize ? 
(*ArraySize)->getSourceRange() : TypeRange)); InitializedEntity Entity = InitializedEntity::InitializeNew(StartLoc, AllocType); AllocType = DeduceTemplateSpecializationFromInitializer( AllocTypeInfo, Entity, Kind, Exprs); if (AllocType.isNull()) return ExprError(); } else if (Deduced && !Deduced->isDeduced()) { MultiExprArg Inits = Exprs; bool Braced = (initStyle == CXXNewExpr::ListInit); if (Braced) { auto *ILE = cast(Exprs[0]); Inits = MultiExprArg(ILE->getInits(), ILE->getNumInits()); } if (initStyle == CXXNewExpr::NoInit || Inits.empty()) return ExprError(Diag(StartLoc, diag::err_auto_new_requires_ctor_arg) << AllocType << TypeRange); if (Inits.size() > 1) { Expr *FirstBad = Inits[1]; return ExprError(Diag(FirstBad->getBeginLoc(), diag::err_auto_new_ctor_multiple_expressions) << AllocType << TypeRange); } if (Braced && !getLangOpts().CPlusPlus17) Diag(Initializer->getBeginLoc(), diag::ext_auto_new_list_init) << AllocType << TypeRange; Expr *Deduce = Inits[0]; if (isa(Deduce)) return ExprError( Diag(Deduce->getBeginLoc(), diag::err_auto_expr_init_paren_braces) << Braced << AllocType << TypeRange); QualType DeducedType; TemplateDeductionInfo Info(Deduce->getExprLoc()); TemplateDeductionResult Result = DeduceAutoType(AllocTypeInfo->getTypeLoc(), Deduce, DeducedType, Info); if (Result != TDK_Success && Result != TDK_AlreadyDiagnosed) return ExprError(Diag(StartLoc, diag::err_auto_new_deduction_failure) << AllocType << Deduce->getType() << TypeRange << Deduce->getSourceRange()); if (DeducedType.isNull()) { assert(Result == TDK_AlreadyDiagnosed); return ExprError(); } AllocType = DeducedType; } // Per C++0x [expr.new]p5, the type being constructed may be a // typedef of an array type. if (!ArraySize) { if (const ConstantArrayType *Array = Context.getAsConstantArrayType(AllocType)) { ArraySize = IntegerLiteral::Create(Context, Array->getSize(), Context.getSizeType(), TypeRange.getEnd()); AllocType = Array->getElementType(); } } if (CheckAllocatedType(AllocType, TypeRange.getBegin(), TypeRange)) return ExprError(); if (ArraySize && !checkArrayElementAlignment(AllocType, TypeRange.getBegin())) return ExprError(); // In ARC, infer 'retaining' for the allocated if (getLangOpts().ObjCAutoRefCount && AllocType.getObjCLifetime() == Qualifiers::OCL_None && AllocType->isObjCLifetimeType()) { AllocType = Context.getLifetimeQualifiedType(AllocType, AllocType->getObjCARCImplicitLifetime()); } QualType ResultType = Context.getPointerType(AllocType); if (ArraySize && *ArraySize && (*ArraySize)->getType()->isNonOverloadPlaceholderType()) { ExprResult result = CheckPlaceholderExpr(*ArraySize); if (result.isInvalid()) return ExprError(); ArraySize = result.get(); } // C++98 5.3.4p6: "The expression in a direct-new-declarator shall have // integral or enumeration type with a non-negative value." // C++11 [expr.new]p6: The expression [...] shall be of integral or unscoped // enumeration type, or a class type for which a single non-explicit // conversion function to integral or unscoped enumeration type exists. // C++1y [expr.new]p6: The expression [...] is implicitly converted to // std::size_t. 
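  // Illustrative example (hypothetical user code): a class type with a single
  // non-explicit conversion function to an integral type may supply the bound,
  //   struct Bound { operator unsigned() const { return 8; } };
  //   int *p = new int[Bound()];
  // and in C++14 and later the bound is then implicitly converted to
  // std::size_t, as handled below.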
std::optional KnownArraySize; if (ArraySize && *ArraySize && !(*ArraySize)->isTypeDependent()) { ExprResult ConvertedSize; if (getLangOpts().CPlusPlus14) { assert(Context.getTargetInfo().getIntWidth() && "Builtin type of size 0?"); ConvertedSize = PerformImplicitConversion(*ArraySize, Context.getSizeType(), AA_Converting); if (!ConvertedSize.isInvalid() && (*ArraySize)->getType()->getAs()) // Diagnose the compatibility of this conversion. Diag(StartLoc, diag::warn_cxx98_compat_array_size_conversion) << (*ArraySize)->getType() << 0 << "'size_t'"; } else { class SizeConvertDiagnoser : public ICEConvertDiagnoser { protected: Expr *ArraySize; public: SizeConvertDiagnoser(Expr *ArraySize) : ICEConvertDiagnoser(/*AllowScopedEnumerations*/false, false, false), ArraySize(ArraySize) {} SemaDiagnosticBuilder diagnoseNotInt(Sema &S, SourceLocation Loc, QualType T) override { return S.Diag(Loc, diag::err_array_size_not_integral) << S.getLangOpts().CPlusPlus11 << T; } SemaDiagnosticBuilder diagnoseIncomplete( Sema &S, SourceLocation Loc, QualType T) override { return S.Diag(Loc, diag::err_array_size_incomplete_type) << T << ArraySize->getSourceRange(); } SemaDiagnosticBuilder diagnoseExplicitConv( Sema &S, SourceLocation Loc, QualType T, QualType ConvTy) override { return S.Diag(Loc, diag::err_array_size_explicit_conversion) << T << ConvTy; } SemaDiagnosticBuilder noteExplicitConv( Sema &S, CXXConversionDecl *Conv, QualType ConvTy) override { return S.Diag(Conv->getLocation(), diag::note_array_size_conversion) << ConvTy->isEnumeralType() << ConvTy; } SemaDiagnosticBuilder diagnoseAmbiguous( Sema &S, SourceLocation Loc, QualType T) override { return S.Diag(Loc, diag::err_array_size_ambiguous_conversion) << T; } SemaDiagnosticBuilder noteAmbiguous( Sema &S, CXXConversionDecl *Conv, QualType ConvTy) override { return S.Diag(Conv->getLocation(), diag::note_array_size_conversion) << ConvTy->isEnumeralType() << ConvTy; } SemaDiagnosticBuilder diagnoseConversion(Sema &S, SourceLocation Loc, QualType T, QualType ConvTy) override { return S.Diag(Loc, S.getLangOpts().CPlusPlus11 ? diag::warn_cxx98_compat_array_size_conversion : diag::ext_array_size_conversion) << T << ConvTy->isEnumeralType() << ConvTy; } } SizeDiagnoser(*ArraySize); ConvertedSize = PerformContextualImplicitConversion(StartLoc, *ArraySize, SizeDiagnoser); } if (ConvertedSize.isInvalid()) return ExprError(); ArraySize = ConvertedSize.get(); QualType SizeType = (*ArraySize)->getType(); if (!SizeType->isIntegralOrUnscopedEnumerationType()) return ExprError(); // C++98 [expr.new]p7: // The expression in a direct-new-declarator shall have integral type // with a non-negative value. // // Let's see if this is a constant < 0. If so, we reject it out of hand, // per CWG1464. Otherwise, if it's not a constant, we must have an // unparenthesized array type. // We've already performed any required implicit conversion to integer or // unscoped enumeration type. // FIXME: Per CWG1464, we are required to check the value prior to // converting to size_t. This will never find a negative array size in // C++14 onwards, because Value is always unsigned here! 
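  // Illustrative example (hypothetical user code): a constant negative bound,
  //   constexpr int N = -1;
  //   new int[N];
  // is rejected here per CWG1464 in pre-C++14 modes; in C++14 and later the
  // value has already been converted to the unsigned std::size_t (see the
  // FIXME above).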
if (std::optional Value = (*ArraySize)->getIntegerConstantExpr(Context)) { if (Value->isSigned() && Value->isNegative()) { return ExprError(Diag((*ArraySize)->getBeginLoc(), diag::err_typecheck_negative_array_size) << (*ArraySize)->getSourceRange()); } if (!AllocType->isDependentType()) { unsigned ActiveSizeBits = ConstantArrayType::getNumAddressingBits(Context, AllocType, *Value); if (ActiveSizeBits > ConstantArrayType::getMaxSizeBits(Context)) return ExprError( Diag((*ArraySize)->getBeginLoc(), diag::err_array_too_large) << toString(*Value, 10) << (*ArraySize)->getSourceRange()); } KnownArraySize = Value->getZExtValue(); } else if (TypeIdParens.isValid()) { // Can't have dynamic array size when the type-id is in parentheses. Diag((*ArraySize)->getBeginLoc(), diag::ext_new_paren_array_nonconst) << (*ArraySize)->getSourceRange() << FixItHint::CreateRemoval(TypeIdParens.getBegin()) << FixItHint::CreateRemoval(TypeIdParens.getEnd()); TypeIdParens = SourceRange(); } // Note that we do *not* convert the argument in any way. It can // be signed, larger than size_t, whatever. } FunctionDecl *OperatorNew = nullptr; FunctionDecl *OperatorDelete = nullptr; unsigned Alignment = AllocType->isDependentType() ? 0 : Context.getTypeAlign(AllocType); unsigned NewAlignment = Context.getTargetInfo().getNewAlign(); bool PassAlignment = getLangOpts().AlignedAllocation && Alignment > NewAlignment; AllocationFunctionScope Scope = UseGlobal ? AFS_Global : AFS_Both; if (!AllocType->isDependentType() && !Expr::hasAnyTypeDependentArguments(PlacementArgs) && FindAllocationFunctions( StartLoc, SourceRange(PlacementLParen, PlacementRParen), Scope, Scope, AllocType, ArraySize.has_value(), PassAlignment, PlacementArgs, OperatorNew, OperatorDelete)) return ExprError(); // If this is an array allocation, compute whether the usual array // deallocation function for the type has a size_t parameter. bool UsualArrayDeleteWantsSize = false; if (ArraySize && !AllocType->isDependentType()) UsualArrayDeleteWantsSize = doesUsualArrayDeleteWantSize(*this, StartLoc, AllocType); SmallVector AllPlaceArgs; if (OperatorNew) { auto *Proto = OperatorNew->getType()->castAs(); VariadicCallType CallType = Proto->isVariadic() ? VariadicFunction : VariadicDoesNotApply; // We've already converted the placement args, just fill in any default // arguments. Skip the first parameter because we don't have a corresponding // argument. Skip the second parameter too if we're passing in the // alignment; we've already filled it in. unsigned NumImplicitArgs = PassAlignment ? 2 : 1; if (GatherArgumentsForCall(PlacementLParen, OperatorNew, Proto, NumImplicitArgs, PlacementArgs, AllPlaceArgs, CallType)) return ExprError(); if (!AllPlaceArgs.empty()) PlacementArgs = AllPlaceArgs; // We would like to perform some checking on the given `operator new` call, // but the PlacementArgs does not contain the implicit arguments, // namely allocation size and maybe allocation alignment, // so we need to conjure them. QualType SizeTy = Context.getSizeType(); unsigned SizeTyWidth = Context.getTypeSize(SizeTy); llvm::APInt SingleEltSize( SizeTyWidth, Context.getTypeSizeInChars(AllocType).getQuantity()); // How many bytes do we want to allocate here? std::optional AllocationSize; if (!ArraySize && !AllocType->isDependentType()) { // For non-array operator new, we only want to allocate one element. AllocationSize = SingleEltSize; } else if (KnownArraySize && !AllocType->isDependentType()) { // For array operator new, only deal with static array size case. 
bool Overflow; AllocationSize = llvm::APInt(SizeTyWidth, *KnownArraySize) .umul_ov(SingleEltSize, Overflow); (void)Overflow; assert( !Overflow && "Expected that all the overflows would have been handled already."); } IntegerLiteral AllocationSizeLiteral( Context, AllocationSize.value_or(llvm::APInt::getZero(SizeTyWidth)), SizeTy, SourceLocation()); // Otherwise, if we failed to constant-fold the allocation size, we'll // just give up and pass-in something opaque, that isn't a null pointer. OpaqueValueExpr OpaqueAllocationSize(SourceLocation(), SizeTy, VK_PRValue, OK_Ordinary, /*SourceExpr=*/nullptr); // Let's synthesize the alignment argument in case we will need it. // Since we *really* want to allocate these on stack, this is slightly ugly // because there might not be a `std::align_val_t` type. EnumDecl *StdAlignValT = getStdAlignValT(); QualType AlignValT = StdAlignValT ? Context.getTypeDeclType(StdAlignValT) : SizeTy; IntegerLiteral AlignmentLiteral( Context, llvm::APInt(Context.getTypeSize(SizeTy), Alignment / Context.getCharWidth()), SizeTy, SourceLocation()); ImplicitCastExpr DesiredAlignment(ImplicitCastExpr::OnStack, AlignValT, CK_IntegralCast, &AlignmentLiteral, VK_PRValue, FPOptionsOverride()); // Adjust placement args by prepending conjured size and alignment exprs. llvm::SmallVector CallArgs; CallArgs.reserve(NumImplicitArgs + PlacementArgs.size()); CallArgs.emplace_back(AllocationSize ? static_cast(&AllocationSizeLiteral) : &OpaqueAllocationSize); if (PassAlignment) CallArgs.emplace_back(&DesiredAlignment); CallArgs.insert(CallArgs.end(), PlacementArgs.begin(), PlacementArgs.end()); DiagnoseSentinelCalls(OperatorNew, PlacementLParen, CallArgs); checkCall(OperatorNew, Proto, /*ThisArg=*/nullptr, CallArgs, /*IsMemberFunction=*/false, StartLoc, Range, CallType); // Warn if the type is over-aligned and is being allocated by (unaligned) // global operator new. if (PlacementArgs.empty() && !PassAlignment && (OperatorNew->isImplicit() || (OperatorNew->getBeginLoc().isValid() && getSourceManager().isInSystemHeader(OperatorNew->getBeginLoc())))) { if (Alignment > NewAlignment) Diag(StartLoc, diag::warn_overaligned_type) << AllocType << unsigned(Alignment / Context.getCharWidth()) << unsigned(NewAlignment / Context.getCharWidth()); } } // Array 'new' can't have any initializers except empty parentheses. // Initializer lists are also allowed, in C++11. Rely on the parser for the // dialect distinction. if (ArraySize && !isLegalArrayNewInitializer(initStyle, Initializer)) { SourceRange InitRange(Exprs.front()->getBeginLoc(), Exprs.back()->getEndLoc()); Diag(StartLoc, diag::err_new_array_init_args) << InitRange; return ExprError(); } // If we can perform the initialization, and we've not already done so, // do it now. if (!AllocType->isDependentType() && !Expr::hasAnyTypeDependentArguments(Exprs)) { // The type we initialize is the complete type, including the array bound. 
QualType InitType; if (KnownArraySize) InitType = Context.getConstantArrayType( AllocType, llvm::APInt(Context.getTypeSize(Context.getSizeType()), *KnownArraySize), *ArraySize, ArrayType::Normal, 0); else if (ArraySize) InitType = Context.getIncompleteArrayType(AllocType, ArrayType::Normal, 0); else InitType = AllocType; InitializedEntity Entity = InitializedEntity::InitializeNew(StartLoc, InitType); InitializationSequence InitSeq(*this, Entity, Kind, Exprs); ExprResult FullInit = InitSeq.Perform(*this, Entity, Kind, Exprs); if (FullInit.isInvalid()) return ExprError(); // FullInit is our initializer; strip off CXXBindTemporaryExprs, because // we don't want the initialized object to be destructed. // FIXME: We should not create these in the first place. if (CXXBindTemporaryExpr *Binder = dyn_cast_or_null(FullInit.get())) FullInit = Binder->getSubExpr(); Initializer = FullInit.get(); // FIXME: If we have a KnownArraySize, check that the array bound of the // initializer is no greater than that constant value. if (ArraySize && !*ArraySize) { auto *CAT = Context.getAsConstantArrayType(Initializer->getType()); if (CAT) { // FIXME: Track that the array size was inferred rather than explicitly // specified. ArraySize = IntegerLiteral::Create( Context, CAT->getSize(), Context.getSizeType(), TypeRange.getEnd()); } else { Diag(TypeRange.getEnd(), diag::err_new_array_size_unknown_from_init) << Initializer->getSourceRange(); } } } // Mark the new and delete operators as referenced. if (OperatorNew) { if (DiagnoseUseOfDecl(OperatorNew, StartLoc)) return ExprError(); MarkFunctionReferenced(StartLoc, OperatorNew); } if (OperatorDelete) { if (DiagnoseUseOfDecl(OperatorDelete, StartLoc)) return ExprError(); MarkFunctionReferenced(StartLoc, OperatorDelete); } return CXXNewExpr::Create(Context, UseGlobal, OperatorNew, OperatorDelete, PassAlignment, UsualArrayDeleteWantsSize, PlacementArgs, TypeIdParens, ArraySize, initStyle, Initializer, ResultType, AllocTypeInfo, Range, DirectInitRange); } /// Checks that a type is suitable as the allocated type /// in a new-expression. bool Sema::CheckAllocatedType(QualType AllocType, SourceLocation Loc, SourceRange R) { // C++ 5.3.4p1: "[The] type shall be a complete object type, but not an // abstract class type or array thereof. 
if (AllocType->isFunctionType()) return Diag(Loc, diag::err_bad_new_type) << AllocType << 0 << R; else if (AllocType->isReferenceType()) return Diag(Loc, diag::err_bad_new_type) << AllocType << 1 << R; else if (!AllocType->isDependentType() && RequireCompleteSizedType( Loc, AllocType, diag::err_new_incomplete_or_sizeless_type, R)) return true; else if (RequireNonAbstractType(Loc, AllocType, diag::err_allocation_of_abstract_type)) return true; else if (AllocType->isVariablyModifiedType()) return Diag(Loc, diag::err_variably_modified_new_type) << AllocType; else if (AllocType.getAddressSpace() != LangAS::Default && !getLangOpts().OpenCLCPlusPlus) return Diag(Loc, diag::err_address_space_qualified_new) << AllocType.getUnqualifiedType() << AllocType.getQualifiers().getAddressSpaceAttributePrintValue(); else if (getLangOpts().ObjCAutoRefCount) { if (const ArrayType *AT = Context.getAsArrayType(AllocType)) { QualType BaseAllocType = Context.getBaseElementType(AT); if (BaseAllocType.getObjCLifetime() == Qualifiers::OCL_None && BaseAllocType->isObjCLifetimeType()) return Diag(Loc, diag::err_arc_new_array_without_ownership) << BaseAllocType; } } return false; } static bool resolveAllocationOverload( Sema &S, LookupResult &R, SourceRange Range, SmallVectorImpl &Args, bool &PassAlignment, FunctionDecl *&Operator, OverloadCandidateSet *AlignedCandidates, Expr *AlignArg, bool Diagnose) { OverloadCandidateSet Candidates(R.getNameLoc(), OverloadCandidateSet::CSK_Normal); for (LookupResult::iterator Alloc = R.begin(), AllocEnd = R.end(); Alloc != AllocEnd; ++Alloc) { // Even member operator new/delete are implicitly treated as // static, so don't use AddMemberCandidate. NamedDecl *D = (*Alloc)->getUnderlyingDecl(); if (FunctionTemplateDecl *FnTemplate = dyn_cast(D)) { S.AddTemplateOverloadCandidate(FnTemplate, Alloc.getPair(), /*ExplicitTemplateArgs=*/nullptr, Args, Candidates, /*SuppressUserConversions=*/false); continue; } FunctionDecl *Fn = cast(D); S.AddOverloadCandidate(Fn, Alloc.getPair(), Args, Candidates, /*SuppressUserConversions=*/false); } // Do the resolution. OverloadCandidateSet::iterator Best; switch (Candidates.BestViableFunction(S, R.getNameLoc(), Best)) { case OR_Success: { // Got one! FunctionDecl *FnDecl = Best->Function; if (S.CheckAllocationAccess(R.getNameLoc(), Range, R.getNamingClass(), Best->FoundDecl) == Sema::AR_inaccessible) return true; Operator = FnDecl; return false; } case OR_No_Viable_Function: // C++17 [expr.new]p13: // If no matching function is found and the allocated object type has // new-extended alignment, the alignment argument is removed from the // argument list, and overload resolution is performed again. if (PassAlignment) { PassAlignment = false; AlignArg = Args[1]; Args.erase(Args.begin() + 1); return resolveAllocationOverload(S, R, Range, Args, PassAlignment, Operator, &Candidates, AlignArg, Diagnose); } // MSVC will fall back on trying to find a matching global operator new // if operator new[] cannot be found. Also, MSVC will leak by not // generating a call to operator delete or operator delete[], but we // will not replicate that bug. // FIXME: Find out how this interacts with the std::align_val_t fallback // once MSVC implements it. if (R.getLookupName().getCXXOverloadedOperator() == OO_Array_New && S.Context.getLangOpts().MSVCCompat) { R.clear(); R.setLookupName(S.Context.DeclarationNames.getCXXOperatorName(OO_New)); S.LookupQualifiedName(R, S.Context.getTranslationUnitDecl()); // FIXME: This will give bad diagnostics pointing at the wrong functions. 
return resolveAllocationOverload(S, R, Range, Args, PassAlignment, Operator, /*Candidates=*/nullptr, /*AlignArg=*/nullptr, Diagnose); } if (Diagnose) { // If this is an allocation of the form 'new (p) X' for some object // pointer p (or an expression that will decay to such a pointer), // diagnose the missing inclusion of . if (!R.isClassLookup() && Args.size() == 2 && (Args[1]->getType()->isObjectPointerType() || Args[1]->getType()->isArrayType())) { S.Diag(R.getNameLoc(), diag::err_need_header_before_placement_new) << R.getLookupName() << Range; // Listing the candidates is unlikely to be useful; skip it. return true; } // Finish checking all candidates before we note any. This checking can // produce additional diagnostics so can't be interleaved with our // emission of notes. // // For an aligned allocation, separately check the aligned and unaligned // candidates with their respective argument lists. SmallVector Cands; SmallVector AlignedCands; llvm::SmallVector AlignedArgs; if (AlignedCandidates) { auto IsAligned = [](OverloadCandidate &C) { return C.Function->getNumParams() > 1 && C.Function->getParamDecl(1)->getType()->isAlignValT(); }; auto IsUnaligned = [&](OverloadCandidate &C) { return !IsAligned(C); }; AlignedArgs.reserve(Args.size() + 1); AlignedArgs.push_back(Args[0]); AlignedArgs.push_back(AlignArg); AlignedArgs.append(Args.begin() + 1, Args.end()); AlignedCands = AlignedCandidates->CompleteCandidates( S, OCD_AllCandidates, AlignedArgs, R.getNameLoc(), IsAligned); Cands = Candidates.CompleteCandidates(S, OCD_AllCandidates, Args, R.getNameLoc(), IsUnaligned); } else { Cands = Candidates.CompleteCandidates(S, OCD_AllCandidates, Args, R.getNameLoc()); } S.Diag(R.getNameLoc(), diag::err_ovl_no_viable_function_in_call) << R.getLookupName() << Range; if (AlignedCandidates) AlignedCandidates->NoteCandidates(S, AlignedArgs, AlignedCands, "", R.getNameLoc()); Candidates.NoteCandidates(S, Args, Cands, "", R.getNameLoc()); } return true; case OR_Ambiguous: if (Diagnose) { Candidates.NoteCandidates( PartialDiagnosticAt(R.getNameLoc(), S.PDiag(diag::err_ovl_ambiguous_call) << R.getLookupName() << Range), S, OCD_AmbiguousCandidates, Args); } return true; case OR_Deleted: { if (Diagnose) { Candidates.NoteCandidates( PartialDiagnosticAt(R.getNameLoc(), S.PDiag(diag::err_ovl_deleted_call) << R.getLookupName() << Range), S, OCD_AllCandidates, Args); } return true; } } llvm_unreachable("Unreachable, bad result from BestViableFunction"); } bool Sema::FindAllocationFunctions(SourceLocation StartLoc, SourceRange Range, AllocationFunctionScope NewScope, AllocationFunctionScope DeleteScope, QualType AllocType, bool IsArray, bool &PassAlignment, MultiExprArg PlaceArgs, FunctionDecl *&OperatorNew, FunctionDecl *&OperatorDelete, bool Diagnose) { // --- Choosing an allocation function --- // C++ 5.3.4p8 - 14 & 18 // 1) If looking in AFS_Global scope for allocation functions, only look in // the global scope. Else, if AFS_Class, only look in the scope of the // allocated class. If AFS_Both, look in both. // 2) If an array size is given, look for operator new[], else look for // operator new. // 3) The first argument is always size_t. Append the arguments from the // placement form. SmallVector AllocArgs; AllocArgs.reserve((PassAlignment ? 2 : 1) + PlaceArgs.size()); // We don't care about the actual value of these arguments. // FIXME: Should the Sema create the expression and embed it in the syntax // tree? Or should the consumer just recalculate the value? 
// FIXME: Using a dummy value will interact poorly with attribute enable_if. QualType SizeTy = Context.getSizeType(); unsigned SizeTyWidth = Context.getTypeSize(SizeTy); IntegerLiteral Size(Context, llvm::APInt::getZero(SizeTyWidth), SizeTy, SourceLocation()); AllocArgs.push_back(&Size); QualType AlignValT = Context.VoidTy; if (PassAlignment) { DeclareGlobalNewDelete(); AlignValT = Context.getTypeDeclType(getStdAlignValT()); } CXXScalarValueInitExpr Align(AlignValT, nullptr, SourceLocation()); if (PassAlignment) AllocArgs.push_back(&Align); AllocArgs.insert(AllocArgs.end(), PlaceArgs.begin(), PlaceArgs.end()); // C++ [expr.new]p8: // If the allocated type is a non-array type, the allocation // function's name is operator new and the deallocation function's // name is operator delete. If the allocated type is an array // type, the allocation function's name is operator new[] and the // deallocation function's name is operator delete[]. DeclarationName NewName = Context.DeclarationNames.getCXXOperatorName( IsArray ? OO_Array_New : OO_New); QualType AllocElemType = Context.getBaseElementType(AllocType); // Find the allocation function. { LookupResult R(*this, NewName, StartLoc, LookupOrdinaryName); // C++1z [expr.new]p9: // If the new-expression begins with a unary :: operator, the allocation // function's name is looked up in the global scope. Otherwise, if the // allocated type is a class type T or array thereof, the allocation // function's name is looked up in the scope of T. if (AllocElemType->isRecordType() && NewScope != AFS_Global) LookupQualifiedName(R, AllocElemType->getAsCXXRecordDecl()); // We can see ambiguity here if the allocation function is found in // multiple base classes. if (R.isAmbiguous()) return true; // If this lookup fails to find the name, or if the allocated type is not // a class type, the allocation function's name is looked up in the // global scope. if (R.empty()) { if (NewScope == AFS_Class) return true; LookupQualifiedName(R, Context.getTranslationUnitDecl()); } if (getLangOpts().OpenCLCPlusPlus && R.empty()) { if (PlaceArgs.empty()) { Diag(StartLoc, diag::err_openclcxx_not_supported) << "default new"; } else { Diag(StartLoc, diag::err_openclcxx_placement_new); } return true; } assert(!R.empty() && "implicitly declared allocation functions not found"); assert(!R.isAmbiguous() && "global allocation functions are ambiguous"); // We do our own custom access checks below. R.suppressDiagnostics(); if (resolveAllocationOverload(*this, R, Range, AllocArgs, PassAlignment, OperatorNew, /*Candidates=*/nullptr, /*AlignArg=*/nullptr, Diagnose)) return true; } // We don't need an operator delete if we're running under -fno-exceptions. if (!getLangOpts().Exceptions) { OperatorDelete = nullptr; return false; } // Note, the name of OperatorNew might have been changed from array to // non-array by resolveAllocationOverload. DeclarationName DeleteName = Context.DeclarationNames.getCXXOperatorName( OperatorNew->getDeclName().getCXXOverloadedOperator() == OO_Array_New ? OO_Array_Delete : OO_Delete); // C++ [expr.new]p19: // // If the new-expression begins with a unary :: operator, the // deallocation function's name is looked up in the global // scope. Otherwise, if the allocated type is a class type T or an // array thereof, the deallocation function's name is looked up in // the scope of T. If this lookup fails to find the name, or if // the allocated type is not a class type or array thereof, the // deallocation function's name is looked up in the global scope. 
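  // Illustrative example (hypothetical user code):
  //   struct T { void operator delete(void *, std::size_t); };
  //   T *p = new T;    // matching delete looked up in T's scope first
  //   T *q = ::new T;  // '::new' forces the lookup into the global scope
  // The class-scope lookup below falls back to the global scope only if it
  // finds nothing.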
LookupResult FoundDelete(*this, DeleteName, StartLoc, LookupOrdinaryName); if (AllocElemType->isRecordType() && DeleteScope != AFS_Global) { auto *RD = cast(AllocElemType->castAs()->getDecl()); LookupQualifiedName(FoundDelete, RD); } if (FoundDelete.isAmbiguous()) return true; // FIXME: clean up expressions? // Filter out any destroying operator deletes. We can't possibly call such a // function in this context, because we're handling the case where the object // was not successfully constructed. // FIXME: This is not covered by the language rules yet. { LookupResult::Filter Filter = FoundDelete.makeFilter(); while (Filter.hasNext()) { auto *FD = dyn_cast(Filter.next()->getUnderlyingDecl()); if (FD && FD->isDestroyingOperatorDelete()) Filter.erase(); } Filter.done(); } bool FoundGlobalDelete = FoundDelete.empty(); if (FoundDelete.empty()) { FoundDelete.clear(LookupOrdinaryName); if (DeleteScope == AFS_Class) return true; DeclareGlobalNewDelete(); LookupQualifiedName(FoundDelete, Context.getTranslationUnitDecl()); } FoundDelete.suppressDiagnostics(); SmallVector, 2> Matches; // Whether we're looking for a placement operator delete is dictated // by whether we selected a placement operator new, not by whether // we had explicit placement arguments. This matters for things like // struct A { void *operator new(size_t, int = 0); ... }; // A *a = new A() // // We don't have any definition for what a "placement allocation function" // is, but we assume it's any allocation function whose // parameter-declaration-clause is anything other than (size_t). // // FIXME: Should (size_t, std::align_val_t) also be considered non-placement? // This affects whether an exception from the constructor of an overaligned // type uses the sized or non-sized form of aligned operator delete. bool isPlacementNew = !PlaceArgs.empty() || OperatorNew->param_size() != 1 || OperatorNew->isVariadic(); if (isPlacementNew) { // C++ [expr.new]p20: // A declaration of a placement deallocation function matches the // declaration of a placement allocation function if it has the // same number of parameters and, after parameter transformations // (8.3.5), all parameter types except the first are // identical. [...] // // To perform this comparison, we compute the function type that // the deallocation function should have, and use that type both // for template argument deduction and for comparison purposes. QualType ExpectedFunctionType; { auto *Proto = OperatorNew->getType()->castAs(); SmallVector ArgTypes; ArgTypes.push_back(Context.VoidPtrTy); for (unsigned I = 1, N = Proto->getNumParams(); I < N; ++I) ArgTypes.push_back(Proto->getParamType(I)); FunctionProtoType::ExtProtoInfo EPI; // FIXME: This is not part of the standard's rule. EPI.Variadic = Proto->isVariadic(); ExpectedFunctionType = Context.getFunctionType(Context.VoidTy, ArgTypes, EPI); } for (LookupResult::iterator D = FoundDelete.begin(), DEnd = FoundDelete.end(); D != DEnd; ++D) { FunctionDecl *Fn = nullptr; if (FunctionTemplateDecl *FnTmpl = dyn_cast((*D)->getUnderlyingDecl())) { // Perform template argument deduction to try to match the // expected function type. 
TemplateDeductionInfo Info(StartLoc); if (DeduceTemplateArguments(FnTmpl, nullptr, ExpectedFunctionType, Fn, Info)) continue; } else Fn = cast((*D)->getUnderlyingDecl()); if (Context.hasSameType(adjustCCAndNoReturn(Fn->getType(), ExpectedFunctionType, /*AdjustExcpetionSpec*/true), ExpectedFunctionType)) Matches.push_back(std::make_pair(D.getPair(), Fn)); } if (getLangOpts().CUDA) EraseUnwantedCUDAMatches(getCurFunctionDecl(/*AllowLambda=*/true), Matches); } else { // C++1y [expr.new]p22: // For a non-placement allocation function, the normal deallocation // function lookup is used // // Per [expr.delete]p10, this lookup prefers a member operator delete // without a size_t argument, but prefers a non-member operator delete // with a size_t where possible (which it always is in this case). llvm::SmallVector BestDeallocFns; UsualDeallocFnInfo Selected = resolveDeallocationOverload( *this, FoundDelete, /*WantSize*/ FoundGlobalDelete, /*WantAlign*/ hasNewExtendedAlignment(*this, AllocElemType), &BestDeallocFns); if (Selected) Matches.push_back(std::make_pair(Selected.Found, Selected.FD)); else { // If we failed to select an operator, all remaining functions are viable // but ambiguous. for (auto Fn : BestDeallocFns) Matches.push_back(std::make_pair(Fn.Found, Fn.FD)); } } // C++ [expr.new]p20: // [...] If the lookup finds a single matching deallocation // function, that function will be called; otherwise, no // deallocation function will be called. if (Matches.size() == 1) { OperatorDelete = Matches[0].second; // C++1z [expr.new]p23: // If the lookup finds a usual deallocation function (3.7.4.2) // with a parameter of type std::size_t and that function, considered // as a placement deallocation function, would have been // selected as a match for the allocation function, the program // is ill-formed. if (getLangOpts().CPlusPlus11 && isPlacementNew && isNonPlacementDeallocationFunction(*this, OperatorDelete)) { UsualDeallocFnInfo Info(*this, DeclAccessPair::make(OperatorDelete, AS_public)); // Core issue, per mail to core reflector, 2016-10-09: // If this is a member operator delete, and there is a corresponding // non-sized member operator delete, this isn't /really/ a sized // deallocation function, it just happens to have a size_t parameter. bool IsSizedDelete = Info.HasSizeT; if (IsSizedDelete && !FoundGlobalDelete) { auto NonSizedDelete = resolveDeallocationOverload(*this, FoundDelete, /*WantSize*/false, /*WantAlign*/Info.HasAlignValT); if (NonSizedDelete && !NonSizedDelete.HasSizeT && NonSizedDelete.HasAlignValT == Info.HasAlignValT) IsSizedDelete = false; } if (IsSizedDelete) { SourceRange R = PlaceArgs.empty() ? SourceRange() : SourceRange(PlaceArgs.front()->getBeginLoc(), PlaceArgs.back()->getEndLoc()); Diag(StartLoc, diag::err_placement_new_non_placement_delete) << R; if (!OperatorDelete->isImplicit()) Diag(OperatorDelete->getLocation(), diag::note_previous_decl) << DeleteName; } } CheckAllocationAccess(StartLoc, Range, FoundDelete.getNamingClass(), Matches[0].first); } else if (!Matches.empty()) { // We found multiple suitable operators. Per [expr.new]p20, that means we // call no 'operator delete' function, but we should at least warn the user. // FIXME: Suppress this warning if the construction cannot throw. 
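    // One way this can arise (illustrative, hypothetical user code): a
    // non-template and a template placement delete that both match the
    // selected placement 'operator new(std::size_t, int)':
    //   struct S {
    //     void *operator new(std::size_t, int);
    //     void operator delete(void *, int);
    //     template <class T> void operator delete(void *, T);
    //   };
    //   new (0) S; // both deletes match; no deallocation function is used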
Diag(StartLoc, diag::warn_ambiguous_suitable_delete_function_found) << DeleteName << AllocElemType; for (auto &Match : Matches) Diag(Match.second->getLocation(), diag::note_member_declared_here) << DeleteName; } return false; } /// DeclareGlobalNewDelete - Declare the global forms of operator new and /// delete. These are: /// @code /// // C++03: /// void* operator new(std::size_t) throw(std::bad_alloc); /// void* operator new[](std::size_t) throw(std::bad_alloc); /// void operator delete(void *) throw(); /// void operator delete[](void *) throw(); /// // C++11: /// void* operator new(std::size_t); /// void* operator new[](std::size_t); /// void operator delete(void *) noexcept; /// void operator delete[](void *) noexcept; /// // C++1y: /// void* operator new(std::size_t); /// void* operator new[](std::size_t); /// void operator delete(void *) noexcept; /// void operator delete[](void *) noexcept; /// void operator delete(void *, std::size_t) noexcept; /// void operator delete[](void *, std::size_t) noexcept; /// @endcode /// Note that the placement and nothrow forms of new are *not* implicitly /// declared. Their use requires including \. void Sema::DeclareGlobalNewDelete() { if (GlobalNewDeleteDeclared) return; // The implicitly declared new and delete operators // are not supported in OpenCL. if (getLangOpts().OpenCLCPlusPlus) return; // C++ [basic.stc.dynamic.general]p2: // The library provides default definitions for the global allocation // and deallocation functions. Some global allocation and deallocation // functions are replaceable ([new.delete]); these are attached to the // global module ([module.unit]). if (getLangOpts().CPlusPlusModules && getCurrentModule()) PushGlobalModuleFragment(SourceLocation()); // C++ [basic.std.dynamic]p2: // [...] The following allocation and deallocation functions (18.4) are // implicitly declared in global scope in each translation unit of a // program // // C++03: // void* operator new(std::size_t) throw(std::bad_alloc); // void* operator new[](std::size_t) throw(std::bad_alloc); // void operator delete(void*) throw(); // void operator delete[](void*) throw(); // C++11: // void* operator new(std::size_t); // void* operator new[](std::size_t); // void operator delete(void*) noexcept; // void operator delete[](void*) noexcept; // C++1y: // void* operator new(std::size_t); // void* operator new[](std::size_t); // void operator delete(void*) noexcept; // void operator delete[](void*) noexcept; // void operator delete(void*, std::size_t) noexcept; // void operator delete[](void*, std::size_t) noexcept; // // These implicit declarations introduce only the function names operator // new, operator new[], operator delete, operator delete[]. // // Here, we need to refer to std::bad_alloc, so we will implicitly declare // "std" or "bad_alloc" as necessary to form the exception specification. // However, we do not make these implicit declarations visible to name // lookup. if (!StdBadAlloc && !getLangOpts().CPlusPlus11) { // The "std::bad_alloc" class has not yet been declared, so build it // implicitly. StdBadAlloc = CXXRecordDecl::Create(Context, TTK_Class, getOrCreateStdNamespace(), SourceLocation(), SourceLocation(), &PP.getIdentifierTable().get("bad_alloc"), nullptr); getStdBadAlloc()->setImplicit(true); // The implicitly declared "std::bad_alloc" should live in global module // fragment. 
if (TheGlobalModuleFragment) { getStdBadAlloc()->setModuleOwnershipKind( Decl::ModuleOwnershipKind::ReachableWhenImported); getStdBadAlloc()->setLocalOwningModule(TheGlobalModuleFragment); } } if (!StdAlignValT && getLangOpts().AlignedAllocation) { // The "std::align_val_t" enum class has not yet been declared, so build it // implicitly. auto *AlignValT = EnumDecl::Create( Context, getOrCreateStdNamespace(), SourceLocation(), SourceLocation(), &PP.getIdentifierTable().get("align_val_t"), nullptr, true, true, true); // The implicitly declared "std::align_val_t" should live in global module // fragment. if (TheGlobalModuleFragment) { AlignValT->setModuleOwnershipKind( Decl::ModuleOwnershipKind::ReachableWhenImported); AlignValT->setLocalOwningModule(TheGlobalModuleFragment); } AlignValT->setIntegerType(Context.getSizeType()); AlignValT->setPromotionType(Context.getSizeType()); AlignValT->setImplicit(true); StdAlignValT = AlignValT; } GlobalNewDeleteDeclared = true; QualType VoidPtr = Context.getPointerType(Context.VoidTy); QualType SizeT = Context.getSizeType(); auto DeclareGlobalAllocationFunctions = [&](OverloadedOperatorKind Kind, QualType Return, QualType Param) { llvm::SmallVector Params; Params.push_back(Param); // Create up to four variants of the function (sized/aligned). bool HasSizedVariant = getLangOpts().SizedDeallocation && (Kind == OO_Delete || Kind == OO_Array_Delete); bool HasAlignedVariant = getLangOpts().AlignedAllocation; int NumSizeVariants = (HasSizedVariant ? 2 : 1); int NumAlignVariants = (HasAlignedVariant ? 2 : 1); for (int Sized = 0; Sized < NumSizeVariants; ++Sized) { if (Sized) Params.push_back(SizeT); for (int Aligned = 0; Aligned < NumAlignVariants; ++Aligned) { if (Aligned) Params.push_back(Context.getTypeDeclType(getStdAlignValT())); DeclareGlobalAllocationFunction( Context.DeclarationNames.getCXXOperatorName(Kind), Return, Params); if (Aligned) Params.pop_back(); } } }; DeclareGlobalAllocationFunctions(OO_New, VoidPtr, SizeT); DeclareGlobalAllocationFunctions(OO_Array_New, VoidPtr, SizeT); DeclareGlobalAllocationFunctions(OO_Delete, Context.VoidTy, VoidPtr); DeclareGlobalAllocationFunctions(OO_Array_Delete, Context.VoidTy, VoidPtr); if (getLangOpts().CPlusPlusModules && getCurrentModule()) PopGlobalModuleFragment(); } /// DeclareGlobalAllocationFunction - Declares a single implicit global /// allocation function if it doesn't already exist. void Sema::DeclareGlobalAllocationFunction(DeclarationName Name, QualType Return, ArrayRef Params) { DeclContext *GlobalCtx = Context.getTranslationUnitDecl(); // Check if this function is already declared. DeclContext::lookup_result R = GlobalCtx->lookup(Name); for (DeclContext::lookup_iterator Alloc = R.begin(), AllocEnd = R.end(); Alloc != AllocEnd; ++Alloc) { // Only look at non-template functions, as it is the predefined, // non-templated allocation function we are trying to declare here. if (FunctionDecl *Func = dyn_cast(*Alloc)) { if (Func->getNumParams() == Params.size()) { llvm::SmallVector FuncParams; for (auto *P : Func->parameters()) FuncParams.push_back( Context.getCanonicalType(P->getType().getUnqualifiedType())); if (llvm::ArrayRef(FuncParams) == Params) { // Make the function visible to name lookup, even if we found it in // an unimported module. It either is an implicitly-declared global // allocation function, or is suppressing that function. 
Func->setVisibleDespiteOwningModule(); return; } } } } FunctionProtoType::ExtProtoInfo EPI(Context.getDefaultCallingConvention( /*IsVariadic=*/false, /*IsCXXMethod=*/false, /*IsBuiltin=*/true)); QualType BadAllocType; bool HasBadAllocExceptionSpec = (Name.getCXXOverloadedOperator() == OO_New || Name.getCXXOverloadedOperator() == OO_Array_New); if (HasBadAllocExceptionSpec) { if (!getLangOpts().CPlusPlus11) { BadAllocType = Context.getTypeDeclType(getStdBadAlloc()); assert(StdBadAlloc && "Must have std::bad_alloc declared"); EPI.ExceptionSpec.Type = EST_Dynamic; EPI.ExceptionSpec.Exceptions = llvm::ArrayRef(BadAllocType); } if (getLangOpts().NewInfallible) { EPI.ExceptionSpec.Type = EST_DynamicNone; } } else { EPI.ExceptionSpec = getLangOpts().CPlusPlus11 ? EST_BasicNoexcept : EST_DynamicNone; } auto CreateAllocationFunctionDecl = [&](Attr *ExtraAttr) { QualType FnType = Context.getFunctionType(Return, Params, EPI); FunctionDecl *Alloc = FunctionDecl::Create( Context, GlobalCtx, SourceLocation(), SourceLocation(), Name, FnType, /*TInfo=*/nullptr, SC_None, getCurFPFeatures().isFPConstrained(), false, true); Alloc->setImplicit(); // Global allocation functions should always be visible. Alloc->setVisibleDespiteOwningModule(); if (HasBadAllocExceptionSpec && getLangOpts().NewInfallible && !getLangOpts().CheckNew) Alloc->addAttr( ReturnsNonNullAttr::CreateImplicit(Context, Alloc->getLocation())); // C++ [basic.stc.dynamic.general]p2: // The library provides default definitions for the global allocation // and deallocation functions. Some global allocation and deallocation // functions are replaceable ([new.delete]); these are attached to the // global module ([module.unit]). // // In the language wording, these functions are attched to the global // module all the time. But in the implementation, the global module // is only meaningful when we're in a module unit. So here we attach // these allocation functions to global module conditionally. if (TheGlobalModuleFragment) { Alloc->setModuleOwnershipKind( Decl::ModuleOwnershipKind::ReachableWhenImported); Alloc->setLocalOwningModule(TheGlobalModuleFragment); } Alloc->addAttr(VisibilityAttr::CreateImplicit( Context, LangOpts.GlobalAllocationFunctionVisibilityHidden ? VisibilityAttr::Hidden : VisibilityAttr::Default)); llvm::SmallVector ParamDecls; for (QualType T : Params) { ParamDecls.push_back(ParmVarDecl::Create( Context, Alloc, SourceLocation(), SourceLocation(), nullptr, T, /*TInfo=*/nullptr, SC_None, nullptr)); ParamDecls.back()->setImplicit(); } Alloc->setParams(ParamDecls); if (ExtraAttr) Alloc->addAttr(ExtraAttr); AddKnownFunctionAttributesForReplaceableGlobalAllocationFunction(Alloc); Context.getTranslationUnitDecl()->addDecl(Alloc); IdResolver.tryAddTopLevelDecl(Alloc, Name); }; if (!LangOpts.CUDA) CreateAllocationFunctionDecl(nullptr); else { // Host and device get their own declaration so each can be // defined or re-declared independently. CreateAllocationFunctionDecl(CUDAHostAttr::CreateImplicit(Context)); CreateAllocationFunctionDecl(CUDADeviceAttr::CreateImplicit(Context)); } } FunctionDecl *Sema::FindUsualDeallocationFunction(SourceLocation StartLoc, bool CanProvideSize, bool Overaligned, DeclarationName Name) { DeclareGlobalNewDelete(); LookupResult FoundDelete(*this, Name, StartLoc, LookupOrdinaryName); LookupQualifiedName(FoundDelete, Context.getTranslationUnitDecl()); // FIXME: It's possible for this to result in ambiguity, through a // user-declared variadic operator delete or the enable_if attribute. 
We // should probably not consider those cases to be usual deallocation // functions. But for now we just make an arbitrary choice in that case. auto Result = resolveDeallocationOverload(*this, FoundDelete, CanProvideSize, Overaligned); assert(Result.FD && "operator delete missing from global scope?"); return Result.FD; } FunctionDecl *Sema::FindDeallocationFunctionForDestructor(SourceLocation Loc, CXXRecordDecl *RD) { DeclarationName Name = Context.DeclarationNames.getCXXOperatorName(OO_Delete); FunctionDecl *OperatorDelete = nullptr; if (FindDeallocationFunction(Loc, RD, Name, OperatorDelete)) return nullptr; if (OperatorDelete) return OperatorDelete; // If there's no class-specific operator delete, look up the global // non-array delete. return FindUsualDeallocationFunction( Loc, true, hasNewExtendedAlignment(*this, Context.getRecordType(RD)), Name); } bool Sema::FindDeallocationFunction(SourceLocation StartLoc, CXXRecordDecl *RD, DeclarationName Name, FunctionDecl *&Operator, bool Diagnose, bool WantSize, bool WantAligned) { LookupResult Found(*this, Name, StartLoc, LookupOrdinaryName); // Try to find operator delete/operator delete[] in class scope. LookupQualifiedName(Found, RD); if (Found.isAmbiguous()) return true; Found.suppressDiagnostics(); bool Overaligned = WantAligned || hasNewExtendedAlignment(*this, Context.getRecordType(RD)); // C++17 [expr.delete]p10: // If the deallocation functions have class scope, the one without a // parameter of type std::size_t is selected. llvm::SmallVector Matches; resolveDeallocationOverload(*this, Found, /*WantSize*/ WantSize, /*WantAlign*/ Overaligned, &Matches); // If we could find an overload, use it. if (Matches.size() == 1) { Operator = cast(Matches[0].FD); // FIXME: DiagnoseUseOfDecl? if (Operator->isDeleted()) { if (Diagnose) { Diag(StartLoc, diag::err_deleted_function_use); NoteDeletedFunction(Operator); } return true; } if (CheckAllocationAccess(StartLoc, SourceRange(), Found.getNamingClass(), Matches[0].Found, Diagnose) == AR_inaccessible) return true; return false; } // We found multiple suitable operators; complain about the ambiguity. // FIXME: The standard doesn't say to do this; it appears that the intent // is that this should never happen. if (!Matches.empty()) { if (Diagnose) { Diag(StartLoc, diag::err_ambiguous_suitable_delete_member_function_found) << Name << RD; for (auto &Match : Matches) Diag(Match.FD->getLocation(), diag::note_member_declared_here) << Name; } return true; } // We did find operator delete/operator delete[] declarations, but // none of them were suitable. if (!Found.empty()) { if (Diagnose) { Diag(StartLoc, diag::err_no_suitable_delete_member_function_found) << Name << RD; for (NamedDecl *D : Found) Diag(D->getUnderlyingDecl()->getLocation(), diag::note_member_declared_here) << Name; } return true; } Operator = nullptr; return false; } namespace { /// Checks whether delete-expression, and new-expression used for /// initializing deletee have the same array form. class MismatchingNewDeleteDetector { public: enum MismatchResult { /// Indicates that there is no mismatch or a mismatch cannot be proven. NoMismatch, /// Indicates that variable is initialized with mismatching form of \a new. VarInitMismatches, /// Indicates that member is initialized with mismatching form of \a new. MemberInitMismatches, /// Indicates that 1 or more constructors' definitions could not been /// analyzed, and they will be checked again at the end of translation unit. 
AnalyzeLater }; /// \param EndOfTU True, if this is the final analysis at the end of /// translation unit. False, if this is the initial analysis at the point /// delete-expression was encountered. explicit MismatchingNewDeleteDetector(bool EndOfTU) : Field(nullptr), IsArrayForm(false), EndOfTU(EndOfTU), HasUndefinedConstructors(false) {} /// Checks whether pointee of a delete-expression is initialized with /// matching form of new-expression. /// /// If return value is \c VarInitMismatches or \c MemberInitMismatches at the /// point where delete-expression is encountered, then a warning will be /// issued immediately. If return value is \c AnalyzeLater at the point where /// delete-expression is seen, then member will be analyzed at the end of /// translation unit. \c AnalyzeLater is returned iff at least one constructor /// couldn't be analyzed. If at least one constructor initializes the member /// with matching type of new, the return value is \c NoMismatch. MismatchResult analyzeDeleteExpr(const CXXDeleteExpr *DE); /// Analyzes a class member. /// \param Field Class member to analyze. /// \param DeleteWasArrayForm Array form-ness of the delete-expression used /// for deleting the \p Field. MismatchResult analyzeField(FieldDecl *Field, bool DeleteWasArrayForm); FieldDecl *Field; /// List of mismatching new-expressions used for initialization of the pointee llvm::SmallVector NewExprs; /// Indicates whether delete-expression was in array form. bool IsArrayForm; private: const bool EndOfTU; /// Indicates that there is at least one constructor without body. bool HasUndefinedConstructors; /// Returns \c CXXNewExpr from given initialization expression. /// \param E Expression used for initializing pointee in delete-expression. /// E can be a single-element \c InitListExpr consisting of new-expression. const CXXNewExpr *getNewExprFromInitListOrExpr(const Expr *E); /// Returns whether member is initialized with mismatching form of /// \c new either by the member initializer or in-class initialization. /// /// If bodies of all constructors are not visible at the end of translation /// unit or at least one constructor initializes member with the matching /// form of \c new, mismatch cannot be proven, and this function will return /// \c NoMismatch. MismatchResult analyzeMemberExpr(const MemberExpr *ME); /// Returns whether variable is initialized with mismatching form of /// \c new. /// /// If variable is initialized with matching form of \c new or variable is not /// initialized with a \c new expression, this function will return true. /// If variable is initialized with mismatching form of \c new, returns false. /// \param D Variable to analyze. bool hasMatchingVarInit(const DeclRefExpr *D); /// Checks whether the constructor initializes pointee with mismatching /// form of \c new. /// /// Returns true, if member is initialized with matching form of \c new in /// member initializer list. Returns false, if member is initialized with the /// matching form of \c new in this constructor's initializer or given /// constructor isn't defined at the point where delete-expression is seen, or /// member isn't initialized by the constructor. bool hasMatchingNewInCtor(const CXXConstructorDecl *CD); /// Checks whether member is initialized with matching form of /// \c new in member initializer list. bool hasMatchingNewInCtorInit(const CXXCtorInitializer *CI); /// Checks whether member is initialized with mismatching form of \c new by /// in-class initializer. 
MismatchResult analyzeInClassInitializer(); }; } MismatchingNewDeleteDetector::MismatchResult MismatchingNewDeleteDetector::analyzeDeleteExpr(const CXXDeleteExpr *DE) { NewExprs.clear(); assert(DE && "Expected delete-expression"); IsArrayForm = DE->isArrayForm(); const Expr *E = DE->getArgument()->IgnoreParenImpCasts(); if (const MemberExpr *ME = dyn_cast(E)) { return analyzeMemberExpr(ME); } else if (const DeclRefExpr *D = dyn_cast(E)) { if (!hasMatchingVarInit(D)) return VarInitMismatches; } return NoMismatch; } const CXXNewExpr * MismatchingNewDeleteDetector::getNewExprFromInitListOrExpr(const Expr *E) { assert(E != nullptr && "Expected a valid initializer expression"); E = E->IgnoreParenImpCasts(); if (const InitListExpr *ILE = dyn_cast(E)) { if (ILE->getNumInits() == 1) E = dyn_cast(ILE->getInit(0)->IgnoreParenImpCasts()); } return dyn_cast_or_null(E); } bool MismatchingNewDeleteDetector::hasMatchingNewInCtorInit( const CXXCtorInitializer *CI) { const CXXNewExpr *NE = nullptr; if (Field == CI->getMember() && (NE = getNewExprFromInitListOrExpr(CI->getInit()))) { if (NE->isArray() == IsArrayForm) return true; else NewExprs.push_back(NE); } return false; } bool MismatchingNewDeleteDetector::hasMatchingNewInCtor( const CXXConstructorDecl *CD) { if (CD->isImplicit()) return false; const FunctionDecl *Definition = CD; if (!CD->isThisDeclarationADefinition() && !CD->isDefined(Definition)) { HasUndefinedConstructors = true; return EndOfTU; } for (const auto *CI : cast(Definition)->inits()) { if (hasMatchingNewInCtorInit(CI)) return true; } return false; } MismatchingNewDeleteDetector::MismatchResult MismatchingNewDeleteDetector::analyzeInClassInitializer() { assert(Field != nullptr && "This should be called only for members"); const Expr *InitExpr = Field->getInClassInitializer(); if (!InitExpr) return EndOfTU ? NoMismatch : AnalyzeLater; if (const CXXNewExpr *NE = getNewExprFromInitListOrExpr(InitExpr)) { if (NE->isArray() != IsArrayForm) { NewExprs.push_back(NE); return MemberInitMismatches; } } return NoMismatch; } MismatchingNewDeleteDetector::MismatchResult MismatchingNewDeleteDetector::analyzeField(FieldDecl *Field, bool DeleteWasArrayForm) { assert(Field != nullptr && "Analysis requires a valid class member."); this->Field = Field; IsArrayForm = DeleteWasArrayForm; const CXXRecordDecl *RD = cast(Field->getParent()); for (const auto *CD : RD->ctors()) { if (hasMatchingNewInCtor(CD)) return NoMismatch; } if (HasUndefinedConstructors) return EndOfTU ? NoMismatch : AnalyzeLater; if (!NewExprs.empty()) return MemberInitMismatches; return Field->hasInClassInitializer() ? 
analyzeInClassInitializer() : NoMismatch; } MismatchingNewDeleteDetector::MismatchResult MismatchingNewDeleteDetector::analyzeMemberExpr(const MemberExpr *ME) { assert(ME != nullptr && "Expected a member expression"); if (FieldDecl *F = dyn_cast(ME->getMemberDecl())) return analyzeField(F, IsArrayForm); return NoMismatch; } bool MismatchingNewDeleteDetector::hasMatchingVarInit(const DeclRefExpr *D) { const CXXNewExpr *NE = nullptr; if (const VarDecl *VD = dyn_cast(D->getDecl())) { if (VD->hasInit() && (NE = getNewExprFromInitListOrExpr(VD->getInit())) && NE->isArray() != IsArrayForm) { NewExprs.push_back(NE); } } return NewExprs.empty(); } static void DiagnoseMismatchedNewDelete(Sema &SemaRef, SourceLocation DeleteLoc, const MismatchingNewDeleteDetector &Detector) { SourceLocation EndOfDelete = SemaRef.getLocForEndOfToken(DeleteLoc); FixItHint H; if (!Detector.IsArrayForm) H = FixItHint::CreateInsertion(EndOfDelete, "[]"); else { SourceLocation RSquare = Lexer::findLocationAfterToken( DeleteLoc, tok::l_square, SemaRef.getSourceManager(), SemaRef.getLangOpts(), true); if (RSquare.isValid()) H = FixItHint::CreateRemoval(SourceRange(EndOfDelete, RSquare)); } SemaRef.Diag(DeleteLoc, diag::warn_mismatched_delete_new) << Detector.IsArrayForm << H; for (const auto *NE : Detector.NewExprs) SemaRef.Diag(NE->getExprLoc(), diag::note_allocated_here) << Detector.IsArrayForm; } void Sema::AnalyzeDeleteExprMismatch(const CXXDeleteExpr *DE) { if (Diags.isIgnored(diag::warn_mismatched_delete_new, SourceLocation())) return; MismatchingNewDeleteDetector Detector(/*EndOfTU=*/false); switch (Detector.analyzeDeleteExpr(DE)) { case MismatchingNewDeleteDetector::VarInitMismatches: case MismatchingNewDeleteDetector::MemberInitMismatches: { DiagnoseMismatchedNewDelete(*this, DE->getBeginLoc(), Detector); break; } case MismatchingNewDeleteDetector::AnalyzeLater: { DeleteExprs[Detector.Field].push_back( std::make_pair(DE->getBeginLoc(), DE->isArrayForm())); break; } case MismatchingNewDeleteDetector::NoMismatch: break; } } void Sema::AnalyzeDeleteExprMismatch(FieldDecl *Field, SourceLocation DeleteLoc, bool DeleteWasArrayForm) { MismatchingNewDeleteDetector Detector(/*EndOfTU=*/true); switch (Detector.analyzeField(Field, DeleteWasArrayForm)) { case MismatchingNewDeleteDetector::VarInitMismatches: llvm_unreachable("This analysis should have been done for class members."); case MismatchingNewDeleteDetector::AnalyzeLater: llvm_unreachable("Analysis cannot be postponed any point beyond end of " "translation unit."); case MismatchingNewDeleteDetector::MemberInitMismatches: DiagnoseMismatchedNewDelete(*this, DeleteLoc, Detector); break; case MismatchingNewDeleteDetector::NoMismatch: break; } } /// ActOnCXXDelete - Parsed a C++ 'delete' expression (C++ 5.3.5), as in: /// @code ::delete ptr; @endcode /// or /// @code delete [] ptr; @endcode ExprResult Sema::ActOnCXXDelete(SourceLocation StartLoc, bool UseGlobal, bool ArrayForm, Expr *ExE) { // C++ [expr.delete]p1: // The operand shall have a pointer type, or a class type having a single // non-explicit conversion function to a pointer type. The result has type // void. // // DR599 amends "pointer type" to "pointer to object type" in both cases. ExprResult Ex = ExE; FunctionDecl *OperatorDelete = nullptr; bool ArrayFormAsWritten = ArrayForm; bool UsualArrayDeleteWantsSize = false; if (!Ex.get()->isTypeDependent()) { // Perform lvalue-to-rvalue cast, if needed. 
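  // Illustrative example (hypothetical user code) of the class-type operand
  // rule quoted above:
  //   struct Handle { operator int *() const; };
  //   void f(Handle h) { delete h; } // uses the single conversion to 'int *'
  // The contextual conversion is performed further below.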
Ex = DefaultLvalueConversion(Ex.get()); if (Ex.isInvalid()) return ExprError(); QualType Type = Ex.get()->getType(); class DeleteConverter : public ContextualImplicitConverter { public: DeleteConverter() : ContextualImplicitConverter(false, true) {} bool match(QualType ConvType) override { // FIXME: If we have an operator T* and an operator void*, we must pick // the operator T*. if (const PointerType *ConvPtrType = ConvType->getAs()) if (ConvPtrType->getPointeeType()->isIncompleteOrObjectType()) return true; return false; } SemaDiagnosticBuilder diagnoseNoMatch(Sema &S, SourceLocation Loc, QualType T) override { return S.Diag(Loc, diag::err_delete_operand) << T; } SemaDiagnosticBuilder diagnoseIncomplete(Sema &S, SourceLocation Loc, QualType T) override { return S.Diag(Loc, diag::err_delete_incomplete_class_type) << T; } SemaDiagnosticBuilder diagnoseExplicitConv(Sema &S, SourceLocation Loc, QualType T, QualType ConvTy) override { return S.Diag(Loc, diag::err_delete_explicit_conversion) << T << ConvTy; } SemaDiagnosticBuilder noteExplicitConv(Sema &S, CXXConversionDecl *Conv, QualType ConvTy) override { return S.Diag(Conv->getLocation(), diag::note_delete_conversion) << ConvTy; } SemaDiagnosticBuilder diagnoseAmbiguous(Sema &S, SourceLocation Loc, QualType T) override { return S.Diag(Loc, diag::err_ambiguous_delete_operand) << T; } SemaDiagnosticBuilder noteAmbiguous(Sema &S, CXXConversionDecl *Conv, QualType ConvTy) override { return S.Diag(Conv->getLocation(), diag::note_delete_conversion) << ConvTy; } SemaDiagnosticBuilder diagnoseConversion(Sema &S, SourceLocation Loc, QualType T, QualType ConvTy) override { llvm_unreachable("conversion functions are permitted"); } } Converter; Ex = PerformContextualImplicitConversion(StartLoc, Ex.get(), Converter); if (Ex.isInvalid()) return ExprError(); Type = Ex.get()->getType(); if (!Converter.match(Type)) // FIXME: PerformContextualImplicitConversion should return ExprError // itself in this case. return ExprError(); QualType Pointee = Type->castAs()->getPointeeType(); QualType PointeeElem = Context.getBaseElementType(Pointee); if (Pointee.getAddressSpace() != LangAS::Default && !getLangOpts().OpenCLCPlusPlus) return Diag(Ex.get()->getBeginLoc(), diag::err_address_space_qualified_delete) << Pointee.getUnqualifiedType() << Pointee.getQualifiers().getAddressSpaceAttributePrintValue(); CXXRecordDecl *PointeeRD = nullptr; if (Pointee->isVoidType() && !isSFINAEContext()) { // The C++ standard bans deleting a pointer to a non-object type, which // effectively bans deletion of "void*". However, most compilers support // this, so we treat it as a warning unless we're in a SFINAE context. Diag(StartLoc, diag::ext_delete_void_ptr_operand) << Type << Ex.get()->getSourceRange(); } else if (Pointee->isFunctionType() || Pointee->isVoidType() || Pointee->isSizelessType()) { return ExprError(Diag(StartLoc, diag::err_delete_operand) << Type << Ex.get()->getSourceRange()); } else if (!Pointee->isDependentType()) { // FIXME: This can result in errors if the definition was imported from a // module but is hidden. 
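  // Illustrative example (hypothetical user code) of operands the converter and
  // checks above accept or reject:
  //
  //   struct H { operator int *() const; };
  //   void g(H h, void *vp, void (*fp)()) {
  //     delete h;   // OK: single non-explicit conversion to an object pointer
  //     delete vp;  // ext_delete_void_ptr_operand (warning outside SFINAE)
  //     delete fp;  // err_delete_operand: pointee is a function type
  //   }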
if (!RequireCompleteType(StartLoc, Pointee, diag::warn_delete_incomplete, Ex.get())) { if (const RecordType *RT = PointeeElem->getAs()) PointeeRD = cast(RT->getDecl()); } } if (Pointee->isArrayType() && !ArrayForm) { Diag(StartLoc, diag::warn_delete_array_type) << Type << Ex.get()->getSourceRange() << FixItHint::CreateInsertion(getLocForEndOfToken(StartLoc), "[]"); ArrayForm = true; } DeclarationName DeleteName = Context.DeclarationNames.getCXXOperatorName( ArrayForm ? OO_Array_Delete : OO_Delete); if (PointeeRD) { if (!UseGlobal && FindDeallocationFunction(StartLoc, PointeeRD, DeleteName, OperatorDelete)) return ExprError(); // If we're allocating an array of records, check whether the // usual operator delete[] has a size_t parameter. if (ArrayForm) { // If the user specifically asked to use the global allocator, // we'll need to do the lookup into the class. if (UseGlobal) UsualArrayDeleteWantsSize = doesUsualArrayDeleteWantSize(*this, StartLoc, PointeeElem); // Otherwise, the usual operator delete[] should be the // function we just found. else if (OperatorDelete && isa(OperatorDelete)) UsualArrayDeleteWantsSize = UsualDeallocFnInfo(*this, DeclAccessPair::make(OperatorDelete, AS_public)) .HasSizeT; } if (!PointeeRD->hasIrrelevantDestructor()) if (CXXDestructorDecl *Dtor = LookupDestructor(PointeeRD)) { MarkFunctionReferenced(StartLoc, const_cast(Dtor)); if (DiagnoseUseOfDecl(Dtor, StartLoc)) return ExprError(); } CheckVirtualDtorCall(PointeeRD->getDestructor(), StartLoc, /*IsDelete=*/true, /*CallCanBeVirtual=*/true, /*WarnOnNonAbstractTypes=*/!ArrayForm, SourceLocation()); } if (!OperatorDelete) { if (getLangOpts().OpenCLCPlusPlus) { Diag(StartLoc, diag::err_openclcxx_not_supported) << "default delete"; return ExprError(); } bool IsComplete = isCompleteType(StartLoc, Pointee); bool CanProvideSize = IsComplete && (!ArrayForm || UsualArrayDeleteWantsSize || Pointee.isDestructedType()); bool Overaligned = hasNewExtendedAlignment(*this, Pointee); // Look for a global declaration. OperatorDelete = FindUsualDeallocationFunction(StartLoc, CanProvideSize, Overaligned, DeleteName); } MarkFunctionReferenced(StartLoc, OperatorDelete); // Check access and ambiguity of destructor if we're going to call it. // Note that this is required even for a virtual delete. bool IsVirtualDelete = false; if (PointeeRD) { if (CXXDestructorDecl *Dtor = LookupDestructor(PointeeRD)) { CheckDestructorAccess(Ex.get()->getExprLoc(), Dtor, PDiag(diag::err_access_dtor) << PointeeElem); IsVirtualDelete = Dtor->isVirtual(); } } DiagnoseUseOfDecl(OperatorDelete, StartLoc); // Convert the operand to the type of the first parameter of operator // delete. This is only necessary if we selected a destroying operator // delete that we are going to call (non-virtually); converting to void* // is trivial and left to AST consumers to handle. QualType ParamType = OperatorDelete->getParamDecl(0)->getType(); if (!IsVirtualDelete && !ParamType->getPointeeType()->isVoidType()) { Qualifiers Qs = Pointee.getQualifiers(); if (Qs.hasCVRQualifiers()) { // Qualifiers are irrelevant to this conversion; we're only looking // for access and ambiguity. 
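  // Illustrative example (hypothetical user code): the CheckVirtualDtorCall above
  // is what produces the delete-non-virtual-dtor family of warnings, e.g.
  //
  //   struct Base { virtual void f(); };             // polymorphic, non-virtual dtor
  //   struct Abs  { virtual void g() = 0; ~Abs(); };
  //   void h(Base *b, Abs *a) {
  //     delete b;  // warn_delete_non_virtual_dtor
  //     delete a;  // warn_delete_abstract_non_virtual_dtor (on by default)
  //   }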
Qs.removeCVRQualifiers(); QualType Unqual = Context.getPointerType( Context.getQualifiedType(Pointee.getUnqualifiedType(), Qs)); Ex = ImpCastExprToType(Ex.get(), Unqual, CK_NoOp); } Ex = PerformImplicitConversion(Ex.get(), ParamType, AA_Passing); if (Ex.isInvalid()) return ExprError(); } } CXXDeleteExpr *Result = new (Context) CXXDeleteExpr( Context.VoidTy, UseGlobal, ArrayForm, ArrayFormAsWritten, UsualArrayDeleteWantsSize, OperatorDelete, Ex.get(), StartLoc); AnalyzeDeleteExprMismatch(Result); return Result; } static bool resolveBuiltinNewDeleteOverload(Sema &S, CallExpr *TheCall, bool IsDelete, FunctionDecl *&Operator) { DeclarationName NewName = S.Context.DeclarationNames.getCXXOperatorName( IsDelete ? OO_Delete : OO_New); LookupResult R(S, NewName, TheCall->getBeginLoc(), Sema::LookupOrdinaryName); S.LookupQualifiedName(R, S.Context.getTranslationUnitDecl()); assert(!R.empty() && "implicitly declared allocation functions not found"); assert(!R.isAmbiguous() && "global allocation functions are ambiguous"); // We do our own custom access checks below. R.suppressDiagnostics(); SmallVector Args(TheCall->arguments()); OverloadCandidateSet Candidates(R.getNameLoc(), OverloadCandidateSet::CSK_Normal); for (LookupResult::iterator FnOvl = R.begin(), FnOvlEnd = R.end(); FnOvl != FnOvlEnd; ++FnOvl) { // Even member operator new/delete are implicitly treated as // static, so don't use AddMemberCandidate. NamedDecl *D = (*FnOvl)->getUnderlyingDecl(); if (FunctionTemplateDecl *FnTemplate = dyn_cast(D)) { S.AddTemplateOverloadCandidate(FnTemplate, FnOvl.getPair(), /*ExplicitTemplateArgs=*/nullptr, Args, Candidates, /*SuppressUserConversions=*/false); continue; } FunctionDecl *Fn = cast(D); S.AddOverloadCandidate(Fn, FnOvl.getPair(), Args, Candidates, /*SuppressUserConversions=*/false); } SourceRange Range = TheCall->getSourceRange(); // Do the resolution. OverloadCandidateSet::iterator Best; switch (Candidates.BestViableFunction(S, R.getNameLoc(), Best)) { case OR_Success: { // Got one! FunctionDecl *FnDecl = Best->Function; assert(R.getNamingClass() == nullptr && "class members should not be considered"); if (!FnDecl->isReplaceableGlobalAllocationFunction()) { S.Diag(R.getNameLoc(), diag::err_builtin_operator_new_delete_not_usual) << (IsDelete ? 1 : 0) << Range; S.Diag(FnDecl->getLocation(), diag::note_non_usual_function_declared_here) << R.getLookupName() << FnDecl->getSourceRange(); return true; } Operator = FnDecl; return false; } case OR_No_Viable_Function: Candidates.NoteCandidates( PartialDiagnosticAt(R.getNameLoc(), S.PDiag(diag::err_ovl_no_viable_function_in_call) << R.getLookupName() << Range), S, OCD_AllCandidates, Args); return true; case OR_Ambiguous: Candidates.NoteCandidates( PartialDiagnosticAt(R.getNameLoc(), S.PDiag(diag::err_ovl_ambiguous_call) << R.getLookupName() << Range), S, OCD_AmbiguousCandidates, Args); return true; case OR_Deleted: { Candidates.NoteCandidates( PartialDiagnosticAt(R.getNameLoc(), S.PDiag(diag::err_ovl_deleted_call) << R.getLookupName() << Range), S, OCD_AllCandidates, Args); return true; } } llvm_unreachable("Unreachable, bad result from BestViableFunction"); } ExprResult Sema::SemaBuiltinOperatorNewDeleteOverloaded(ExprResult TheCallResult, bool IsDelete) { CallExpr *TheCall = cast(TheCallResult.get()); if (!getLangOpts().CPlusPlus) { Diag(TheCall->getExprLoc(), diag::err_builtin_requires_language) << (IsDelete ? 
"__builtin_operator_delete" : "__builtin_operator_new") << "C++"; return ExprError(); } // CodeGen assumes it can find the global new and delete to call, // so ensure that they are declared. DeclareGlobalNewDelete(); FunctionDecl *OperatorNewOrDelete = nullptr; if (resolveBuiltinNewDeleteOverload(*this, TheCall, IsDelete, OperatorNewOrDelete)) return ExprError(); assert(OperatorNewOrDelete && "should be found"); DiagnoseUseOfDecl(OperatorNewOrDelete, TheCall->getExprLoc()); MarkFunctionReferenced(TheCall->getExprLoc(), OperatorNewOrDelete); TheCall->setType(OperatorNewOrDelete->getReturnType()); for (unsigned i = 0; i != TheCall->getNumArgs(); ++i) { QualType ParamTy = OperatorNewOrDelete->getParamDecl(i)->getType(); InitializedEntity Entity = InitializedEntity::InitializeParameter(Context, ParamTy, false); ExprResult Arg = PerformCopyInitialization( Entity, TheCall->getArg(i)->getBeginLoc(), TheCall->getArg(i)); if (Arg.isInvalid()) return ExprError(); TheCall->setArg(i, Arg.get()); } auto Callee = dyn_cast(TheCall->getCallee()); assert(Callee && Callee->getCastKind() == CK_BuiltinFnToFnPtr && "Callee expected to be implicit cast to a builtin function pointer"); Callee->setType(OperatorNewOrDelete->getType()); return TheCallResult; } void Sema::CheckVirtualDtorCall(CXXDestructorDecl *dtor, SourceLocation Loc, bool IsDelete, bool CallCanBeVirtual, bool WarnOnNonAbstractTypes, SourceLocation DtorLoc) { if (!dtor || dtor->isVirtual() || !CallCanBeVirtual || isUnevaluatedContext()) return; // C++ [expr.delete]p3: // In the first alternative (delete object), if the static type of the // object to be deleted is different from its dynamic type, the static // type shall be a base class of the dynamic type of the object to be // deleted and the static type shall have a virtual destructor or the // behavior is undefined. // const CXXRecordDecl *PointeeRD = dtor->getParent(); // Note: a final class cannot be derived from, no issue there if (!PointeeRD->isPolymorphic() || PointeeRD->hasAttr()) return; // If the superclass is in a system header, there's nothing that can be done. // The `delete` (where we emit the warning) can be in a system header, // what matters for this warning is where the deleted type is defined. if (getSourceManager().isInSystemHeader(PointeeRD->getLocation())) return; QualType ClassType = dtor->getThisType()->getPointeeType(); if (PointeeRD->isAbstract()) { // If the class is abstract, we warn by default, because we're // sure the code has undefined behavior. Diag(Loc, diag::warn_delete_abstract_non_virtual_dtor) << (IsDelete ? 0 : 1) << ClassType; } else if (WarnOnNonAbstractTypes) { // Otherwise, if this is not an array delete, it's a bit suspect, // but not necessarily wrong. Diag(Loc, diag::warn_delete_non_virtual_dtor) << (IsDelete ? 0 : 1) << ClassType; } if (!IsDelete) { std::string TypeStr; ClassType.getAsStringInternal(TypeStr, getPrintingPolicy()); Diag(DtorLoc, diag::note_delete_non_virtual) << FixItHint::CreateInsertion(DtorLoc, TypeStr + "::"); } } Sema::ConditionResult Sema::ActOnConditionVariable(Decl *ConditionVar, SourceLocation StmtLoc, ConditionKind CK) { ExprResult E = CheckConditionVariable(cast(ConditionVar), StmtLoc, CK); if (E.isInvalid()) return ConditionError(); return ConditionResult(*this, ConditionVar, MakeFullExpr(E.get(), StmtLoc), CK == ConditionKind::ConstexprIf); } /// Check the use of the given variable as a C++ condition in an if, /// while, do-while, or switch statement. 
ExprResult Sema::CheckConditionVariable(VarDecl *ConditionVar, SourceLocation StmtLoc, ConditionKind CK) { if (ConditionVar->isInvalidDecl()) return ExprError(); QualType T = ConditionVar->getType(); // C++ [stmt.select]p2: // The declarator shall not specify a function or an array. if (T->isFunctionType()) return ExprError(Diag(ConditionVar->getLocation(), diag::err_invalid_use_of_function_type) << ConditionVar->getSourceRange()); else if (T->isArrayType()) return ExprError(Diag(ConditionVar->getLocation(), diag::err_invalid_use_of_array_type) << ConditionVar->getSourceRange()); ExprResult Condition = BuildDeclRefExpr( ConditionVar, ConditionVar->getType().getNonReferenceType(), VK_LValue, ConditionVar->getLocation()); switch (CK) { case ConditionKind::Boolean: return CheckBooleanCondition(StmtLoc, Condition.get()); case ConditionKind::ConstexprIf: return CheckBooleanCondition(StmtLoc, Condition.get(), true); case ConditionKind::Switch: return CheckSwitchCondition(StmtLoc, Condition.get()); } llvm_unreachable("unexpected condition kind"); } /// CheckCXXBooleanCondition - Returns true if a conversion to bool is invalid. ExprResult Sema::CheckCXXBooleanCondition(Expr *CondExpr, bool IsConstexpr) { // C++11 6.4p4: // The value of a condition that is an initialized declaration in a statement // other than a switch statement is the value of the declared variable // implicitly converted to type bool. If that conversion is ill-formed, the // program is ill-formed. // The value of a condition that is an expression is the value of the // expression, implicitly converted to bool. // // C++23 8.5.2p2 // If the if statement is of the form if constexpr, the value of the condition // is contextually converted to bool and the converted expression shall be // a constant expression. // ExprResult E = PerformContextuallyConvertToBool(CondExpr); if (!IsConstexpr || E.isInvalid() || E.get()->isValueDependent()) return E; // FIXME: Return this value to the caller so they don't need to recompute it. llvm::APSInt Cond; E = VerifyIntegerConstantExpression( E.get(), &Cond, diag::err_constexpr_if_condition_expression_is_not_constant); return E; } /// Helper function to determine whether this is the (deprecated) C++ /// conversion from a string literal to a pointer to non-const char or /// non-const wchar_t (for narrow and wide string literals, /// respectively). bool Sema::IsStringLiteralToNonConstPointerConversion(Expr *From, QualType ToType) { // Look inside the implicit cast, if it exists. if (ImplicitCastExpr *Cast = dyn_cast(From)) From = Cast->getSubExpr(); // A string literal (2.13.4) that is not a wide string literal can // be converted to an rvalue of type "pointer to char"; a wide // string literal can be converted to an rvalue of type "pointer // to wchar_t" (C++ 4.2p2). if (StringLiteral *StrLit = dyn_cast(From->IgnoreParens())) if (const PointerType *ToPtrType = ToType->getAs()) if (const BuiltinType *ToPointeeType = ToPtrType->getPointeeType()->getAs()) { // This conversion is considered only when there is an // explicit appropriate pointer target type (C++ 4.2p2). 
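    // Illustrative example (hypothetical user code) of the conversion this
    // helper identifies:
    //
    //   char *P = "text";        // deprecated in C++03, ill-formed since C++11;
    //                            // diagnosed by the deprecated-string-literal
    //                            // diagnostics applied during ICK_Qualification
    //   const char *Q = "text";  // fine; not this conversion
    //
    // UTF-8/16/32 literals are excluded by the switch below.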
if (!ToPtrType->getPointeeType().hasQualifiers()) { switch (StrLit->getKind()) { case StringLiteral::UTF8: case StringLiteral::UTF16: case StringLiteral::UTF32: // We don't allow UTF literals to be implicitly converted break; case StringLiteral::Ordinary: return (ToPointeeType->getKind() == BuiltinType::Char_U || ToPointeeType->getKind() == BuiltinType::Char_S); case StringLiteral::Wide: return Context.typesAreCompatible(Context.getWideCharType(), QualType(ToPointeeType, 0)); case StringLiteral::Unevaluated: assert(false && "Unevaluated string literal in expression"); break; } } } return false; } static ExprResult BuildCXXCastArgument(Sema &S, SourceLocation CastLoc, QualType Ty, CastKind Kind, CXXMethodDecl *Method, DeclAccessPair FoundDecl, bool HadMultipleCandidates, Expr *From) { switch (Kind) { default: llvm_unreachable("Unhandled cast kind!"); case CK_ConstructorConversion: { CXXConstructorDecl *Constructor = cast(Method); SmallVector ConstructorArgs; if (S.RequireNonAbstractType(CastLoc, Ty, diag::err_allocation_of_abstract_type)) return ExprError(); if (S.CompleteConstructorCall(Constructor, Ty, From, CastLoc, ConstructorArgs)) return ExprError(); S.CheckConstructorAccess(CastLoc, Constructor, FoundDecl, InitializedEntity::InitializeTemporary(Ty)); if (S.DiagnoseUseOfDecl(Method, CastLoc)) return ExprError(); ExprResult Result = S.BuildCXXConstructExpr( CastLoc, Ty, FoundDecl, cast(Method), ConstructorArgs, HadMultipleCandidates, /*ListInit*/ false, /*StdInitListInit*/ false, /*ZeroInit*/ false, CXXConstructExpr::CK_Complete, SourceRange()); if (Result.isInvalid()) return ExprError(); return S.MaybeBindToTemporary(Result.getAs()); } case CK_UserDefinedConversion: { assert(!From->getType()->isPointerType() && "Arg can't have pointer type!"); S.CheckMemberOperatorAccess(CastLoc, From, /*arg*/ nullptr, FoundDecl); if (S.DiagnoseUseOfDecl(Method, CastLoc)) return ExprError(); // Create an implicit call expr that calls it. CXXConversionDecl *Conv = cast(Method); ExprResult Result = S.BuildCXXMemberCallExpr(From, FoundDecl, Conv, HadMultipleCandidates); if (Result.isInvalid()) return ExprError(); // Record usage of conversion in an implicit cast. Result = ImplicitCastExpr::Create(S.Context, Result.get()->getType(), CK_UserDefinedConversion, Result.get(), nullptr, Result.get()->getValueKind(), S.CurFPFeatureOverrides()); return S.MaybeBindToTemporary(Result.get()); } } } /// PerformImplicitConversion - Perform an implicit conversion of the /// expression From to the type ToType using the pre-computed implicit /// conversion sequence ICS. Returns the converted /// expression. Action is the kind of conversion we're performing, /// used in the error message. ExprResult Sema::PerformImplicitConversion(Expr *From, QualType ToType, const ImplicitConversionSequence &ICS, AssignmentAction Action, CheckedConversionKind CCK) { // C++ [over.match.oper]p7: [...] operands of class type are converted [...] 
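  // Illustrative example (hypothetical user code) of the two cast kinds handled
  // by BuildCXXCastArgument above:
  //
  //   struct Str  { Str(const char *); };     // CK_ConstructorConversion
  //   struct Cell { operator int() const; };  // CK_UserDefinedConversion
  //   void Take(Str);
  //   void Use(Cell C) { Take("abc"); int N = C; }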
if (CCK == CCK_ForBuiltinOverloadedOp && !From->getType()->isRecordType()) return From; switch (ICS.getKind()) { case ImplicitConversionSequence::StandardConversion: { ExprResult Res = PerformImplicitConversion(From, ToType, ICS.Standard, Action, CCK); if (Res.isInvalid()) return ExprError(); From = Res.get(); break; } case ImplicitConversionSequence::UserDefinedConversion: { FunctionDecl *FD = ICS.UserDefined.ConversionFunction; CastKind CastKind; QualType BeforeToType; assert(FD && "no conversion function for user-defined conversion seq"); if (const CXXConversionDecl *Conv = dyn_cast(FD)) { CastKind = CK_UserDefinedConversion; // If the user-defined conversion is specified by a conversion function, // the initial standard conversion sequence converts the source type to // the implicit object parameter of the conversion function. BeforeToType = Context.getTagDeclType(Conv->getParent()); } else { const CXXConstructorDecl *Ctor = cast(FD); CastKind = CK_ConstructorConversion; // Do no conversion if dealing with ... for the first conversion. if (!ICS.UserDefined.EllipsisConversion) { // If the user-defined conversion is specified by a constructor, the // initial standard conversion sequence converts the source type to // the type required by the argument of the constructor BeforeToType = Ctor->getParamDecl(0)->getType().getNonReferenceType(); } } // Watch out for ellipsis conversion. if (!ICS.UserDefined.EllipsisConversion) { ExprResult Res = PerformImplicitConversion(From, BeforeToType, ICS.UserDefined.Before, AA_Converting, CCK); if (Res.isInvalid()) return ExprError(); From = Res.get(); } ExprResult CastArg = BuildCXXCastArgument( *this, From->getBeginLoc(), ToType.getNonReferenceType(), CastKind, cast(FD), ICS.UserDefined.FoundConversionFunction, ICS.UserDefined.HadMultipleCandidates, From); if (CastArg.isInvalid()) return ExprError(); From = CastArg.get(); // C++ [over.match.oper]p7: // [...] the second standard conversion sequence of a user-defined // conversion sequence is not applied. if (CCK == CCK_ForBuiltinOverloadedOp) return From; return PerformImplicitConversion(From, ToType, ICS.UserDefined.After, AA_Converting, CCK); } case ImplicitConversionSequence::AmbiguousConversion: ICS.DiagnoseAmbiguousConversion(*this, From->getExprLoc(), PDiag(diag::err_typecheck_ambiguous_condition) << From->getSourceRange()); return ExprError(); case ImplicitConversionSequence::EllipsisConversion: case ImplicitConversionSequence::StaticObjectArgumentConversion: llvm_unreachable("bad conversion"); case ImplicitConversionSequence::BadConversion: Sema::AssignConvertType ConvTy = CheckAssignmentConstraints(From->getExprLoc(), ToType, From->getType()); bool Diagnosed = DiagnoseAssignmentResult( ConvTy == Compatible ? Incompatible : ConvTy, From->getExprLoc(), ToType, From->getType(), From, Action); assert(Diagnosed && "failed to diagnose bad conversion"); (void)Diagnosed; return ExprError(); } // Everything went well. return From; } /// PerformImplicitConversion - Perform an implicit conversion of the /// expression From to the type ToType by following the standard /// conversion sequence SCS. Returns the converted /// expression. Flavor is the context in which we're performing this /// conversion, for use in error messages. 
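/// For example (illustrative), initializing a 'long' from a 'char' lvalue uses
/// ICK_Lvalue_To_Rvalue as the first conversion and ICK_Integral_Conversion as
/// the second:
/// @code
///   char C = 'a';
///   long L = C;
/// @endcode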
ExprResult Sema::PerformImplicitConversion(Expr *From, QualType ToType, const StandardConversionSequence& SCS, AssignmentAction Action, CheckedConversionKind CCK) { bool CStyle = (CCK == CCK_CStyleCast || CCK == CCK_FunctionalCast); // Overall FIXME: we are recomputing too many types here and doing far too // much extra work. What this means is that we need to keep track of more // information that is computed when we try the implicit conversion initially, // so that we don't need to recompute anything here. QualType FromType = From->getType(); if (SCS.CopyConstructor) { // FIXME: When can ToType be a reference type? assert(!ToType->isReferenceType()); if (SCS.Second == ICK_Derived_To_Base) { SmallVector ConstructorArgs; if (CompleteConstructorCall( cast(SCS.CopyConstructor), ToType, From, /*FIXME:ConstructLoc*/ SourceLocation(), ConstructorArgs)) return ExprError(); return BuildCXXConstructExpr( /*FIXME:ConstructLoc*/ SourceLocation(), ToType, SCS.FoundCopyConstructor, SCS.CopyConstructor, ConstructorArgs, /*HadMultipleCandidates*/ false, /*ListInit*/ false, /*StdInitListInit*/ false, /*ZeroInit*/ false, CXXConstructExpr::CK_Complete, SourceRange()); } return BuildCXXConstructExpr( /*FIXME:ConstructLoc*/ SourceLocation(), ToType, SCS.FoundCopyConstructor, SCS.CopyConstructor, From, /*HadMultipleCandidates*/ false, /*ListInit*/ false, /*StdInitListInit*/ false, /*ZeroInit*/ false, CXXConstructExpr::CK_Complete, SourceRange()); } // Resolve overloaded function references. if (Context.hasSameType(FromType, Context.OverloadTy)) { DeclAccessPair Found; FunctionDecl *Fn = ResolveAddressOfOverloadedFunction(From, ToType, true, Found); if (!Fn) return ExprError(); if (DiagnoseUseOfDecl(Fn, From->getBeginLoc())) return ExprError(); From = FixOverloadedFunctionReference(From, Found, Fn); // We might get back another placeholder expression if we resolved to a // builtin. ExprResult Checked = CheckPlaceholderExpr(From); if (Checked.isInvalid()) return ExprError(); From = Checked.get(); FromType = From->getType(); } // If we're converting to an atomic type, first convert to the corresponding // non-atomic type. QualType ToAtomicType; if (const AtomicType *ToAtomic = ToType->getAs()) { ToAtomicType = ToType; ToType = ToAtomic->getValueType(); } QualType InitialFromType = FromType; // Perform the first implicit conversion. 
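  // Illustrative examples (hypothetical user code) of first conversions:
  //
  //   int A[3];   int *P = A;         // ICK_Array_To_Pointer
  //   void Fn();  void (*FP)() = Fn;  // ICK_Function_To_Pointer
  //   int X = 1;  int Y = X;          // ICK_Lvalue_To_Rvalue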
switch (SCS.First) { case ICK_Identity: if (const AtomicType *FromAtomic = FromType->getAs()) { FromType = FromAtomic->getValueType().getUnqualifiedType(); From = ImplicitCastExpr::Create(Context, FromType, CK_AtomicToNonAtomic, From, /*BasePath=*/nullptr, VK_PRValue, FPOptionsOverride()); } break; case ICK_Lvalue_To_Rvalue: { assert(From->getObjectKind() != OK_ObjCProperty); ExprResult FromRes = DefaultLvalueConversion(From); if (FromRes.isInvalid()) return ExprError(); From = FromRes.get(); FromType = From->getType(); break; } case ICK_Array_To_Pointer: FromType = Context.getArrayDecayedType(FromType); From = ImpCastExprToType(From, FromType, CK_ArrayToPointerDecay, VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); break; case ICK_Function_To_Pointer: FromType = Context.getPointerType(FromType); From = ImpCastExprToType(From, FromType, CK_FunctionToPointerDecay, VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); break; default: llvm_unreachable("Improper first standard conversion"); } // Perform the second implicit conversion switch (SCS.Second) { case ICK_Identity: // C++ [except.spec]p5: // [For] assignment to and initialization of pointers to functions, // pointers to member functions, and references to functions: the // target entity shall allow at least the exceptions allowed by the // source value in the assignment or initialization. switch (Action) { case AA_Assigning: case AA_Initializing: // Note, function argument passing and returning are initialization. case AA_Passing: case AA_Returning: case AA_Sending: case AA_Passing_CFAudited: if (CheckExceptionSpecCompatibility(From, ToType)) return ExprError(); break; case AA_Casting: case AA_Converting: // Casts and implicit conversions are not initialization, so are not // checked for exception specification mismatches. break; } // Nothing else to do. 
break; case ICK_Integral_Promotion: case ICK_Integral_Conversion: if (ToType->isBooleanType()) { assert(FromType->castAs()->getDecl()->isFixed() && SCS.Second == ICK_Integral_Promotion && "only enums with fixed underlying type can promote to bool"); From = ImpCastExprToType(From, ToType, CK_IntegralToBoolean, VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); } else { From = ImpCastExprToType(From, ToType, CK_IntegralCast, VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); } break; case ICK_Floating_Promotion: case ICK_Floating_Conversion: From = ImpCastExprToType(From, ToType, CK_FloatingCast, VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); break; case ICK_Complex_Promotion: case ICK_Complex_Conversion: { QualType FromEl = From->getType()->castAs()->getElementType(); QualType ToEl = ToType->castAs()->getElementType(); CastKind CK; if (FromEl->isRealFloatingType()) { if (ToEl->isRealFloatingType()) CK = CK_FloatingComplexCast; else CK = CK_FloatingComplexToIntegralComplex; } else if (ToEl->isRealFloatingType()) { CK = CK_IntegralComplexToFloatingComplex; } else { CK = CK_IntegralComplexCast; } From = ImpCastExprToType(From, ToType, CK, VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); break; } case ICK_Floating_Integral: if (ToType->isRealFloatingType()) From = ImpCastExprToType(From, ToType, CK_IntegralToFloating, VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); else From = ImpCastExprToType(From, ToType, CK_FloatingToIntegral, VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); break; case ICK_Compatible_Conversion: From = ImpCastExprToType(From, ToType, CK_NoOp, From->getValueKind(), /*BasePath=*/nullptr, CCK).get(); break; case ICK_Writeback_Conversion: case ICK_Pointer_Conversion: { if (SCS.IncompatibleObjC && Action != AA_Casting) { // Diagnose incompatible Objective-C conversions if (Action == AA_Initializing || Action == AA_Assigning) Diag(From->getBeginLoc(), diag::ext_typecheck_convert_incompatible_pointer) << ToType << From->getType() << Action << From->getSourceRange() << 0; else Diag(From->getBeginLoc(), diag::ext_typecheck_convert_incompatible_pointer) << From->getType() << ToType << Action << From->getSourceRange() << 0; if (From->getType()->isObjCObjectPointerType() && ToType->isObjCObjectPointerType()) EmitRelatedResultTypeNote(From); } else if (getLangOpts().allowsNonTrivialObjCLifetimeQualifiers() && !CheckObjCARCUnavailableWeakConversion(ToType, From->getType())) { if (Action == AA_Initializing) Diag(From->getBeginLoc(), diag::err_arc_weak_unavailable_assign); else Diag(From->getBeginLoc(), diag::err_arc_convesion_of_weak_unavailable) << (Action == AA_Casting) << From->getType() << ToType << From->getSourceRange(); } // Defer address space conversion to the third conversion. QualType FromPteeType = From->getType()->getPointeeType(); QualType ToPteeType = ToType->getPointeeType(); QualType NewToType = ToType; if (!FromPteeType.isNull() && !ToPteeType.isNull() && FromPteeType.getAddressSpace() != ToPteeType.getAddressSpace()) { NewToType = Context.removeAddrSpaceQualType(ToPteeType); NewToType = Context.getAddrSpaceQualType(NewToType, FromPteeType.getAddressSpace()); if (ToType->isObjCObjectPointerType()) NewToType = Context.getObjCObjectPointerType(NewToType); else if (ToType->isBlockPointerType()) NewToType = Context.getBlockPointerType(NewToType); else NewToType = Context.getPointerType(NewToType); } CastKind Kind; CXXCastPath BasePath; if (CheckPointerConversion(From, NewToType, Kind, BasePath, CStyle)) return ExprError(); // Make sure we extend blocks if necessary. 
// FIXME: doing this here is really ugly. if (Kind == CK_BlockPointerToObjCPointerCast) { ExprResult E = From; (void) PrepareCastToObjCObjectPointer(E); From = E.get(); } if (getLangOpts().allowsNonTrivialObjCLifetimeQualifiers()) CheckObjCConversion(SourceRange(), NewToType, From, CCK); From = ImpCastExprToType(From, NewToType, Kind, VK_PRValue, &BasePath, CCK) .get(); break; } case ICK_Pointer_Member: { CastKind Kind; CXXCastPath BasePath; if (CheckMemberPointerConversion(From, ToType, Kind, BasePath, CStyle)) return ExprError(); if (CheckExceptionSpecCompatibility(From, ToType)) return ExprError(); // We may not have been able to figure out what this member pointer resolved // to up until this exact point. Attempt to lock-in it's inheritance model. if (Context.getTargetInfo().getCXXABI().isMicrosoft()) { (void)isCompleteType(From->getExprLoc(), From->getType()); (void)isCompleteType(From->getExprLoc(), ToType); } From = ImpCastExprToType(From, ToType, Kind, VK_PRValue, &BasePath, CCK).get(); break; } case ICK_Boolean_Conversion: // Perform half-to-boolean conversion via float. if (From->getType()->isHalfType()) { From = ImpCastExprToType(From, Context.FloatTy, CK_FloatingCast).get(); FromType = Context.FloatTy; } From = ImpCastExprToType(From, Context.BoolTy, ScalarTypeToBooleanCastKind(FromType), VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); break; case ICK_Derived_To_Base: { CXXCastPath BasePath; if (CheckDerivedToBaseConversion( From->getType(), ToType.getNonReferenceType(), From->getBeginLoc(), From->getSourceRange(), &BasePath, CStyle)) return ExprError(); From = ImpCastExprToType(From, ToType.getNonReferenceType(), CK_DerivedToBase, From->getValueKind(), &BasePath, CCK).get(); break; } case ICK_Vector_Conversion: From = ImpCastExprToType(From, ToType, CK_BitCast, VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); break; case ICK_SVE_Vector_Conversion: case ICK_RVV_Vector_Conversion: From = ImpCastExprToType(From, ToType, CK_BitCast, VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); break; case ICK_Vector_Splat: { // Vector splat from any arithmetic type to a vector. Expr *Elem = prepareVectorSplat(ToType, From).get(); From = ImpCastExprToType(Elem, ToType, CK_VectorSplat, VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); break; } case ICK_Complex_Real: // Case 1. x -> _Complex y if (const ComplexType *ToComplex = ToType->getAs()) { QualType ElType = ToComplex->getElementType(); bool isFloatingComplex = ElType->isRealFloatingType(); // x -> y if (Context.hasSameUnqualifiedType(ElType, From->getType())) { // do nothing } else if (From->getType()->isRealFloatingType()) { From = ImpCastExprToType(From, ElType, isFloatingComplex ? CK_FloatingCast : CK_FloatingToIntegral).get(); } else { assert(From->getType()->isIntegerType()); From = ImpCastExprToType(From, ElType, isFloatingComplex ? CK_IntegralToFloating : CK_IntegralCast).get(); } // y -> _Complex y From = ImpCastExprToType(From, ToType, isFloatingComplex ? CK_FloatingRealToComplex : CK_IntegralRealToComplex).get(); // Case 2. _Complex x -> y } else { auto *FromComplex = From->getType()->castAs(); QualType ElType = FromComplex->getElementType(); bool isFloatingComplex = ElType->isRealFloatingType(); // _Complex x -> x From = ImpCastExprToType(From, ElType, isFloatingComplex ? 
CK_FloatingComplexToReal : CK_IntegralComplexToReal, VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); // x -> y if (Context.hasSameUnqualifiedType(ElType, ToType)) { // do nothing } else if (ToType->isRealFloatingType()) { From = ImpCastExprToType(From, ToType, isFloatingComplex ? CK_FloatingCast : CK_IntegralToFloating, VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); } else { assert(ToType->isIntegerType()); From = ImpCastExprToType(From, ToType, isFloatingComplex ? CK_FloatingToIntegral : CK_IntegralCast, VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); } } break; case ICK_Block_Pointer_Conversion: { LangAS AddrSpaceL = ToType->castAs()->getPointeeType().getAddressSpace(); LangAS AddrSpaceR = FromType->castAs()->getPointeeType().getAddressSpace(); assert(Qualifiers::isAddressSpaceSupersetOf(AddrSpaceL, AddrSpaceR) && "Invalid cast"); CastKind Kind = AddrSpaceL != AddrSpaceR ? CK_AddressSpaceConversion : CK_BitCast; From = ImpCastExprToType(From, ToType.getUnqualifiedType(), Kind, VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); break; } case ICK_TransparentUnionConversion: { ExprResult FromRes = From; Sema::AssignConvertType ConvTy = CheckTransparentUnionArgumentConstraints(ToType, FromRes); if (FromRes.isInvalid()) return ExprError(); From = FromRes.get(); assert ((ConvTy == Sema::Compatible) && "Improper transparent union conversion"); (void)ConvTy; break; } case ICK_Zero_Event_Conversion: case ICK_Zero_Queue_Conversion: From = ImpCastExprToType(From, ToType, CK_ZeroToOCLOpaqueType, From->getValueKind()).get(); break; case ICK_Lvalue_To_Rvalue: case ICK_Array_To_Pointer: case ICK_Function_To_Pointer: case ICK_Function_Conversion: case ICK_Qualification: case ICK_Num_Conversion_Kinds: case ICK_C_Only_Conversion: case ICK_Incompatible_Pointer_Conversion: llvm_unreachable("Improper second standard conversion"); } switch (SCS.Third) { case ICK_Identity: // Nothing to do. break; case ICK_Function_Conversion: // If both sides are functions (or pointers/references to them), there could // be incompatible exception declarations. if (CheckExceptionSpecCompatibility(From, ToType)) return ExprError(); From = ImpCastExprToType(From, ToType, CK_NoOp, VK_PRValue, /*BasePath=*/nullptr, CCK) .get(); break; case ICK_Qualification: { ExprValueKind VK = From->getValueKind(); CastKind CK = CK_NoOp; if (ToType->isReferenceType() && ToType->getPointeeType().getAddressSpace() != From->getType().getAddressSpace()) CK = CK_AddressSpaceConversion; if (ToType->isPointerType() && ToType->getPointeeType().getAddressSpace() != From->getType()->getPointeeType().getAddressSpace()) CK = CK_AddressSpaceConversion; if (!isCast(CCK) && !ToType->getPointeeType().getQualifiers().hasUnaligned() && From->getType()->getPointeeType().getQualifiers().hasUnaligned()) { Diag(From->getBeginLoc(), diag::warn_imp_cast_drops_unaligned) << InitialFromType << ToType; } From = ImpCastExprToType(From, ToType.getNonLValueExprType(Context), CK, VK, /*BasePath=*/nullptr, CCK) .get(); if (SCS.DeprecatedStringLiteralToCharPtr && !getLangOpts().WritableStrings) { Diag(From->getBeginLoc(), getLangOpts().CPlusPlus11 ? diag::ext_deprecated_string_literal_conversion : diag::warn_deprecated_string_literal_conversion) << ToType.getNonReferenceType(); } break; } default: llvm_unreachable("Improper third standard conversion"); } // If this conversion sequence involved a scalar -> atomic conversion, perform // that conversion now. 
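  // Illustrative example (hypothetical user code, assuming Clang's _Atomic
  // extension in C++):
  //
  //   _Atomic(int) AI = 42;  // 42 becomes an int prvalue, then is wrapped in a
  //                          // CK_NonAtomicToAtomic cast below
  //
  // Likewise, when the target of the sequence is a reference type, the
  // converted prvalue is materialized below so the reference can bind.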
if (!ToAtomicType.isNull()) { assert(Context.hasSameType( ToAtomicType->castAs()->getValueType(), From->getType())); From = ImpCastExprToType(From, ToAtomicType, CK_NonAtomicToAtomic, VK_PRValue, nullptr, CCK) .get(); } // Materialize a temporary if we're implicitly converting to a reference // type. This is not required by the C++ rules but is necessary to maintain // AST invariants. if (ToType->isReferenceType() && From->isPRValue()) { ExprResult Res = TemporaryMaterializationConversion(From); if (Res.isInvalid()) return ExprError(); From = Res.get(); } // If this conversion sequence succeeded and involved implicitly converting a // _Nullable type to a _Nonnull one, complain. if (!isCast(CCK)) diagnoseNullableToNonnullConversion(ToType, InitialFromType, From->getBeginLoc()); return From; } /// Check the completeness of a type in a unary type trait. /// /// If the particular type trait requires a complete type, tries to complete /// it. If completing the type fails, a diagnostic is emitted and false /// returned. If completing the type succeeds or no completion was required, /// returns true. static bool CheckUnaryTypeTraitTypeCompleteness(Sema &S, TypeTrait UTT, SourceLocation Loc, QualType ArgTy) { // C++0x [meta.unary.prop]p3: // For all of the class templates X declared in this Clause, instantiating // that template with a template argument that is a class template // specialization may result in the implicit instantiation of the template // argument if and only if the semantics of X require that the argument // must be a complete type. // We apply this rule to all the type trait expressions used to implement // these class templates. We also try to follow any GCC documented behavior // in these expressions to ensure portability of standard libraries. switch (UTT) { default: llvm_unreachable("not a UTT"); // is_complete_type somewhat obviously cannot require a complete type. case UTT_IsCompleteType: // Fall-through // These traits are modeled on the type predicates in C++0x // [meta.unary.cat] and [meta.unary.comp]. They are not specified as // requiring a complete type, as whether or not they return true cannot be // impacted by the completeness of the type. case UTT_IsVoid: case UTT_IsIntegral: case UTT_IsFloatingPoint: case UTT_IsArray: case UTT_IsBoundedArray: case UTT_IsPointer: case UTT_IsNullPointer: case UTT_IsReferenceable: case UTT_IsLvalueReference: case UTT_IsRvalueReference: case UTT_IsMemberFunctionPointer: case UTT_IsMemberObjectPointer: case UTT_IsEnum: case UTT_IsScopedEnum: case UTT_IsUnion: case UTT_IsClass: case UTT_IsFunction: case UTT_IsReference: case UTT_IsArithmetic: case UTT_IsFundamental: case UTT_IsObject: case UTT_IsScalar: case UTT_IsCompound: case UTT_IsMemberPointer: // Fall-through // These traits are modeled on type predicates in C++0x [meta.unary.prop] // which requires some of its traits to have the complete type. However, // the completeness of the type cannot impact these traits' semantics, and // so they don't require it. This matches the comments on these traits in // Table 49. case UTT_IsConst: case UTT_IsVolatile: case UTT_IsSigned: case UTT_IsUnboundedArray: case UTT_IsUnsigned: // This type trait always returns false, checking the type is moot. case UTT_IsInterfaceClass: return true; // C++14 [meta.unary.prop]: // If T is a non-union class type, T shall be a complete type. 
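  // Illustrative example (hypothetical user code):
  //
  //   struct Fwd;                       // declared but never defined
  //   bool B1 = __is_polymorphic(Fwd);  // error: incomplete type used in type trait
  //   bool B2 = __is_pointer(Fwd *);    // OK: completeness not required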
case UTT_IsEmpty: case UTT_IsPolymorphic: case UTT_IsAbstract: if (const auto *RD = ArgTy->getAsCXXRecordDecl()) if (!RD->isUnion()) return !S.RequireCompleteType( Loc, ArgTy, diag::err_incomplete_type_used_in_type_trait_expr); return true; // C++14 [meta.unary.prop]: // If T is a class type, T shall be a complete type. case UTT_IsFinal: case UTT_IsSealed: if (ArgTy->getAsCXXRecordDecl()) return !S.RequireCompleteType( Loc, ArgTy, diag::err_incomplete_type_used_in_type_trait_expr); return true; // LWG3823: T shall be an array type, a complete type, or cv void. case UTT_IsAggregate: if (ArgTy->isArrayType() || ArgTy->isVoidType()) return true; return !S.RequireCompleteType( Loc, ArgTy, diag::err_incomplete_type_used_in_type_trait_expr); // C++1z [meta.unary.prop]: // remove_all_extents_t shall be a complete type or cv void. case UTT_IsTrivial: case UTT_IsTriviallyCopyable: case UTT_IsStandardLayout: case UTT_IsPOD: case UTT_IsLiteral: // By analogy, is_trivially_relocatable and is_trivially_equality_comparable // impose the same constraints. case UTT_IsTriviallyRelocatable: case UTT_IsTriviallyEqualityComparable: case UTT_CanPassInRegs: // Per the GCC type traits documentation, T shall be a complete type, cv void, // or an array of unknown bound. But GCC actually imposes the same constraints // as above. case UTT_HasNothrowAssign: case UTT_HasNothrowMoveAssign: case UTT_HasNothrowConstructor: case UTT_HasNothrowCopy: case UTT_HasTrivialAssign: case UTT_HasTrivialMoveAssign: case UTT_HasTrivialDefaultConstructor: case UTT_HasTrivialMoveConstructor: case UTT_HasTrivialCopy: case UTT_HasTrivialDestructor: case UTT_HasVirtualDestructor: ArgTy = QualType(ArgTy->getBaseElementTypeUnsafe(), 0); [[fallthrough]]; // C++1z [meta.unary.prop]: // T shall be a complete type, cv void, or an array of unknown bound. case UTT_IsDestructible: case UTT_IsNothrowDestructible: case UTT_IsTriviallyDestructible: case UTT_HasUniqueObjectRepresentations: if (ArgTy->isIncompleteArrayType() || ArgTy->isVoidType()) return true; return !S.RequireCompleteType( Loc, ArgTy, diag::err_incomplete_type_used_in_type_trait_expr); } } static bool HasNoThrowOperator(const RecordType *RT, OverloadedOperatorKind Op, Sema &Self, SourceLocation KeyLoc, ASTContext &C, bool (CXXRecordDecl::*HasTrivial)() const, bool (CXXRecordDecl::*HasNonTrivial)() const, bool (CXXMethodDecl::*IsDesiredOp)() const) { CXXRecordDecl *RD = cast(RT->getDecl()); if ((RD->*HasTrivial)() && !(RD->*HasNonTrivial)()) return true; DeclarationName Name = C.DeclarationNames.getCXXOperatorName(Op); DeclarationNameInfo NameInfo(Name, KeyLoc); LookupResult Res(Self, NameInfo, Sema::LookupOrdinaryName); if (Self.LookupQualifiedName(Res, RD)) { bool FoundOperator = false; Res.suppressDiagnostics(); for (LookupResult::iterator Op = Res.begin(), OpEnd = Res.end(); Op != OpEnd; ++Op) { if (isa(*Op)) continue; CXXMethodDecl *Operator = cast(*Op); if((Operator->*IsDesiredOp)()) { FoundOperator = true; auto *CPT = Operator->getType()->castAs(); CPT = Self.ResolveExceptionSpec(KeyLoc, CPT); if (!CPT || !CPT->isNothrow()) return false; } } return FoundOperator; } return false; } static bool EvaluateUnaryTypeTrait(Sema &Self, TypeTrait UTT, SourceLocation KeyLoc, QualType T) { assert(!T->isDependentType() && "Cannot evaluate traits of dependent type"); ASTContext &C = Self.Context; switch(UTT) { default: llvm_unreachable("not a UTT"); // Type trait expressions corresponding to the primary type category // predicates in C++0x [meta.unary.cat]. 
case UTT_IsVoid: return T->isVoidType(); case UTT_IsIntegral: return T->isIntegralType(C); case UTT_IsFloatingPoint: return T->isFloatingType(); case UTT_IsArray: return T->isArrayType(); case UTT_IsBoundedArray: if (!T->isVariableArrayType()) { return T->isArrayType() && !T->isIncompleteArrayType(); } Self.Diag(KeyLoc, diag::err_vla_unsupported) << 1 << tok::kw___is_bounded_array; return false; case UTT_IsUnboundedArray: if (!T->isVariableArrayType()) { return T->isIncompleteArrayType(); } Self.Diag(KeyLoc, diag::err_vla_unsupported) << 1 << tok::kw___is_unbounded_array; return false; case UTT_IsPointer: return T->isAnyPointerType(); case UTT_IsNullPointer: return T->isNullPtrType(); case UTT_IsLvalueReference: return T->isLValueReferenceType(); case UTT_IsRvalueReference: return T->isRValueReferenceType(); case UTT_IsMemberFunctionPointer: return T->isMemberFunctionPointerType(); case UTT_IsMemberObjectPointer: return T->isMemberDataPointerType(); case UTT_IsEnum: return T->isEnumeralType(); case UTT_IsScopedEnum: return T->isScopedEnumeralType(); case UTT_IsUnion: return T->isUnionType(); case UTT_IsClass: return T->isClassType() || T->isStructureType() || T->isInterfaceType(); case UTT_IsFunction: return T->isFunctionType(); // Type trait expressions which correspond to the convenient composition // predicates in C++0x [meta.unary.comp]. case UTT_IsReference: return T->isReferenceType(); case UTT_IsArithmetic: return T->isArithmeticType() && !T->isEnumeralType(); case UTT_IsFundamental: return T->isFundamentalType(); case UTT_IsObject: return T->isObjectType(); case UTT_IsScalar: // Note: semantic analysis depends on Objective-C lifetime types to be // considered scalar types. However, such types do not actually behave // like scalar types at run time (since they may require retain/release // operations), so we report them as non-scalar. if (T->isObjCLifetimeType()) { switch (T.getObjCLifetime()) { case Qualifiers::OCL_None: case Qualifiers::OCL_ExplicitNone: return true; case Qualifiers::OCL_Strong: case Qualifiers::OCL_Weak: case Qualifiers::OCL_Autoreleasing: return false; } } return T->isScalarType(); case UTT_IsCompound: return T->isCompoundType(); case UTT_IsMemberPointer: return T->isMemberPointerType(); // Type trait expressions which correspond to the type property predicates // in C++0x [meta.unary.prop]. case UTT_IsConst: return T.isConstQualified(); case UTT_IsVolatile: return T.isVolatileQualified(); case UTT_IsTrivial: return T.isTrivialType(C); case UTT_IsTriviallyCopyable: return T.isTriviallyCopyableType(C); case UTT_IsStandardLayout: return T->isStandardLayoutType(); case UTT_IsPOD: return T.isPODType(C); case UTT_IsLiteral: return T->isLiteralType(C); case UTT_IsEmpty: if (const CXXRecordDecl *RD = T->getAsCXXRecordDecl()) return !RD->isUnion() && RD->isEmpty(); return false; case UTT_IsPolymorphic: if (const CXXRecordDecl *RD = T->getAsCXXRecordDecl()) return !RD->isUnion() && RD->isPolymorphic(); return false; case UTT_IsAbstract: if (const CXXRecordDecl *RD = T->getAsCXXRecordDecl()) return !RD->isUnion() && RD->isAbstract(); return false; case UTT_IsAggregate: // Report vector extensions and complex types as aggregates because they // support aggregate initialization. GCC mirrors this behavior for vectors // but not _Complex. 
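    // Illustrative example (hypothetical user code, assuming the vector_size
    // extension):
    //
    //   typedef int V4 __attribute__((vector_size(16)));
    //   struct Agg { int A, B; };
    //   static_assert(__is_aggregate(Agg), "");
    //   static_assert(__is_aggregate(V4), "");              // vector extension
    //   static_assert(__is_aggregate(_Complex double), ""); // unlike GCC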
return T->isAggregateType() || T->isVectorType() || T->isExtVectorType() || T->isAnyComplexType(); // __is_interface_class only returns true when CL is invoked in /CLR mode and // even then only when it is used with the 'interface struct ...' syntax // Clang doesn't support /CLR which makes this type trait moot. case UTT_IsInterfaceClass: return false; case UTT_IsFinal: case UTT_IsSealed: if (const CXXRecordDecl *RD = T->getAsCXXRecordDecl()) return RD->hasAttr(); return false; case UTT_IsSigned: // Enum types should always return false. // Floating points should always return true. return T->isFloatingType() || (T->isSignedIntegerType() && !T->isEnumeralType()); case UTT_IsUnsigned: // Enum types should always return false. return T->isUnsignedIntegerType() && !T->isEnumeralType(); // Type trait expressions which query classes regarding their construction, // destruction, and copying. Rather than being based directly on the // related type predicates in the standard, they are specified by both // GCC[1] and the Embarcadero C++ compiler[2], and Clang implements those // specifications. // // 1: http://gcc.gnu/.org/onlinedocs/gcc/Type-Traits.html // 2: http://docwiki.embarcadero.com/RADStudio/XE/en/Type_Trait_Functions_(C%2B%2B0x)_Index // // Note that these builtins do not behave as documented in g++: if a class // has both a trivial and a non-trivial special member of a particular kind, // they return false! For now, we emulate this behavior. // FIXME: This appears to be a g++ bug: more complex cases reveal that it // does not correctly compute triviality in the presence of multiple special // members of the same kind. Revisit this once the g++ bug is fixed. case UTT_HasTrivialDefaultConstructor: // http://gcc.gnu.org/onlinedocs/gcc/Type-Traits.html: // If __is_pod (type) is true then the trait is true, else if type is // a cv class or union type (or array thereof) with a trivial default // constructor ([class.ctor]) then the trait is true, else it is false. if (T.isPODType(C)) return true; if (CXXRecordDecl *RD = C.getBaseElementType(T)->getAsCXXRecordDecl()) return RD->hasTrivialDefaultConstructor() && !RD->hasNonTrivialDefaultConstructor(); return false; case UTT_HasTrivialMoveConstructor: // This trait is implemented by MSVC 2012 and needed to parse the // standard library headers. Specifically this is used as the logic // behind std::is_trivially_move_constructible (20.9.4.3). if (T.isPODType(C)) return true; if (CXXRecordDecl *RD = C.getBaseElementType(T)->getAsCXXRecordDecl()) return RD->hasTrivialMoveConstructor() && !RD->hasNonTrivialMoveConstructor(); return false; case UTT_HasTrivialCopy: // http://gcc.gnu.org/onlinedocs/gcc/Type-Traits.html: // If __is_pod (type) is true or type is a reference type then // the trait is true, else if type is a cv class or union type // with a trivial copy constructor ([class.copy]) then the trait // is true, else it is false. if (T.isPODType(C) || T->isReferenceType()) return true; if (CXXRecordDecl *RD = T->getAsCXXRecordDecl()) return RD->hasTrivialCopyConstructor() && !RD->hasNonTrivialCopyConstructor(); return false; case UTT_HasTrivialMoveAssign: // This trait is implemented by MSVC 2012 and needed to parse the // standard library headers. 
Specifically it is used as the logic // behind std::is_trivially_move_assignable (20.9.4.3) if (T.isPODType(C)) return true; if (CXXRecordDecl *RD = C.getBaseElementType(T)->getAsCXXRecordDecl()) return RD->hasTrivialMoveAssignment() && !RD->hasNonTrivialMoveAssignment(); return false; case UTT_HasTrivialAssign: // http://gcc.gnu.org/onlinedocs/gcc/Type-Traits.html: // If type is const qualified or is a reference type then the // trait is false. Otherwise if __is_pod (type) is true then the // trait is true, else if type is a cv class or union type with // a trivial copy assignment ([class.copy]) then the trait is // true, else it is false. // Note: the const and reference restrictions are interesting, // given that const and reference members don't prevent a class // from having a trivial copy assignment operator (but do cause // errors if the copy assignment operator is actually used, q.v. // [class.copy]p12). if (T.isConstQualified()) return false; if (T.isPODType(C)) return true; if (CXXRecordDecl *RD = T->getAsCXXRecordDecl()) return RD->hasTrivialCopyAssignment() && !RD->hasNonTrivialCopyAssignment(); return false; case UTT_IsDestructible: case UTT_IsTriviallyDestructible: case UTT_IsNothrowDestructible: // C++14 [meta.unary.prop]: // For reference types, is_destructible::value is true. if (T->isReferenceType()) return true; // Objective-C++ ARC: autorelease types don't require destruction. if (T->isObjCLifetimeType() && T.getObjCLifetime() == Qualifiers::OCL_Autoreleasing) return true; // C++14 [meta.unary.prop]: // For incomplete types and function types, is_destructible::value is // false. if (T->isIncompleteType() || T->isFunctionType()) return false; // A type that requires destruction (via a non-trivial destructor or ARC // lifetime semantics) is not trivially-destructible. if (UTT == UTT_IsTriviallyDestructible && T.isDestructedType()) return false; // C++14 [meta.unary.prop]: // For object types and given U equal to remove_all_extents_t, if the // expression std::declval().~U() is well-formed when treated as an // unevaluated operand (Clause 5), then is_destructible::value is true if (auto *RD = C.getBaseElementType(T)->getAsCXXRecordDecl()) { CXXDestructorDecl *Destructor = Self.LookupDestructor(RD); if (!Destructor) return false; // C++14 [dcl.fct.def.delete]p2: // A program that refers to a deleted function implicitly or // explicitly, other than to declare it, is ill-formed. if (Destructor->isDeleted()) return false; if (C.getLangOpts().AccessControl && Destructor->getAccess() != AS_public) return false; if (UTT == UTT_IsNothrowDestructible) { auto *CPT = Destructor->getType()->castAs(); CPT = Self.ResolveExceptionSpec(KeyLoc, CPT); if (!CPT || !CPT->isNothrow()) return false; } } return true; case UTT_HasTrivialDestructor: // http://gcc.gnu.org/onlinedocs/gcc/Type-Traits.html // If __is_pod (type) is true or type is a reference type // then the trait is true, else if type is a cv class or union // type (or array thereof) with a trivial destructor // ([class.dtor]) then the trait is true, else it is // false. if (T.isPODType(C) || T->isReferenceType()) return true; // Objective-C++ ARC: autorelease types don't require destruction. if (T->isObjCLifetimeType() && T.getObjCLifetime() == Qualifiers::OCL_Autoreleasing) return true; if (CXXRecordDecl *RD = C.getBaseElementType(T)->getAsCXXRecordDecl()) return RD->hasTrivialDestructor(); return false; // TODO: Propagate nothrowness for implicitly declared special members. 
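  // Illustrative example (hypothetical user code):
  //
  //   struct NT { NT &operator=(const NT &) noexcept; };
  //   struct MT { MT &operator=(const MT &); };  // potentially throwing
  //   static_assert(__has_nothrow_assign(NT), "");
  //   static_assert(!__has_nothrow_assign(MT), "");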
case UTT_HasNothrowAssign: // http://gcc.gnu.org/onlinedocs/gcc/Type-Traits.html: // If type is const qualified or is a reference type then the // trait is false. Otherwise if __has_trivial_assign (type) // is true then the trait is true, else if type is a cv class // or union type with copy assignment operators that are known // not to throw an exception then the trait is true, else it is // false. if (C.getBaseElementType(T).isConstQualified()) return false; if (T->isReferenceType()) return false; if (T.isPODType(C) || T->isObjCLifetimeType()) return true; if (const RecordType *RT = T->getAs()) return HasNoThrowOperator(RT, OO_Equal, Self, KeyLoc, C, &CXXRecordDecl::hasTrivialCopyAssignment, &CXXRecordDecl::hasNonTrivialCopyAssignment, &CXXMethodDecl::isCopyAssignmentOperator); return false; case UTT_HasNothrowMoveAssign: // This trait is implemented by MSVC 2012 and needed to parse the // standard library headers. Specifically this is used as the logic // behind std::is_nothrow_move_assignable (20.9.4.3). if (T.isPODType(C)) return true; if (const RecordType *RT = C.getBaseElementType(T)->getAs()) return HasNoThrowOperator(RT, OO_Equal, Self, KeyLoc, C, &CXXRecordDecl::hasTrivialMoveAssignment, &CXXRecordDecl::hasNonTrivialMoveAssignment, &CXXMethodDecl::isMoveAssignmentOperator); return false; case UTT_HasNothrowCopy: // http://gcc.gnu.org/onlinedocs/gcc/Type-Traits.html: // If __has_trivial_copy (type) is true then the trait is true, else // if type is a cv class or union type with copy constructors that are // known not to throw an exception then the trait is true, else it is // false. if (T.isPODType(C) || T->isReferenceType() || T->isObjCLifetimeType()) return true; if (CXXRecordDecl *RD = T->getAsCXXRecordDecl()) { if (RD->hasTrivialCopyConstructor() && !RD->hasNonTrivialCopyConstructor()) return true; bool FoundConstructor = false; unsigned FoundTQs; for (const auto *ND : Self.LookupConstructors(RD)) { // A template constructor is never a copy constructor. // FIXME: However, it may actually be selected at the actual overload // resolution point. if (isa(ND->getUnderlyingDecl())) continue; // UsingDecl itself is not a constructor if (isa(ND)) continue; auto *Constructor = cast(ND->getUnderlyingDecl()); if (Constructor->isCopyConstructor(FoundTQs)) { FoundConstructor = true; auto *CPT = Constructor->getType()->castAs(); CPT = Self.ResolveExceptionSpec(KeyLoc, CPT); if (!CPT) return false; // TODO: check whether evaluating default arguments can throw. // For now, we'll be conservative and assume that they can throw. if (!CPT->isNothrow() || CPT->getNumParams() > 1) return false; } } return FoundConstructor; } return false; case UTT_HasNothrowConstructor: // http://gcc.gnu.org/onlinedocs/gcc/Type-Traits.html // If __has_trivial_constructor (type) is true then the trait is // true, else if type is a cv class or union type (or array // thereof) with a default constructor that is known not to // throw an exception then the trait is true, else it is false. if (T.isPODType(C) || T->isObjCLifetimeType()) return true; if (CXXRecordDecl *RD = C.getBaseElementType(T)->getAsCXXRecordDecl()) { if (RD->hasTrivialDefaultConstructor() && !RD->hasNonTrivialDefaultConstructor()) return true; bool FoundConstructor = false; for (const auto *ND : Self.LookupConstructors(RD)) { // FIXME: In C++0x, a constructor template can be a default constructor. 
if (isa(ND->getUnderlyingDecl())) continue; // UsingDecl itself is not a constructor if (isa(ND)) continue; auto *Constructor = cast(ND->getUnderlyingDecl()); if (Constructor->isDefaultConstructor()) { FoundConstructor = true; auto *CPT = Constructor->getType()->castAs(); CPT = Self.ResolveExceptionSpec(KeyLoc, CPT); if (!CPT) return false; // FIXME: check whether evaluating default arguments can throw. // For now, we'll be conservative and assume that they can throw. if (!CPT->isNothrow() || CPT->getNumParams() > 0) return false; } } return FoundConstructor; } return false; case UTT_HasVirtualDestructor: // http://gcc.gnu.org/onlinedocs/gcc/Type-Traits.html: // If type is a class type with a virtual destructor ([class.dtor]) // then the trait is true, else it is false. if (CXXRecordDecl *RD = T->getAsCXXRecordDecl()) if (CXXDestructorDecl *Destructor = Self.LookupDestructor(RD)) return Destructor->isVirtual(); return false; // These type trait expressions are modeled on the specifications for the // Embarcadero C++0x type trait functions: // http://docwiki.embarcadero.com/RADStudio/XE/en/Type_Trait_Functions_(C%2B%2B0x)_Index case UTT_IsCompleteType: // http://docwiki.embarcadero.com/RADStudio/XE/en/Is_complete_type_(typename_T_): // Returns True if and only if T is a complete type at the point of the // function call. return !T->isIncompleteType(); case UTT_HasUniqueObjectRepresentations: return C.hasUniqueObjectRepresentations(T); case UTT_IsTriviallyRelocatable: return T.isTriviallyRelocatableType(C); case UTT_IsReferenceable: return T.isReferenceable(); case UTT_CanPassInRegs: if (CXXRecordDecl *RD = T->getAsCXXRecordDecl(); RD && !T.hasQualifiers()) return RD->canPassInRegisters(); Self.Diag(KeyLoc, diag::err_builtin_pass_in_regs_non_class) << T; return false; case UTT_IsTriviallyEqualityComparable: return T.isTriviallyEqualityComparableType(C); } } static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT, QualType LhsT, QualType RhsT, SourceLocation KeyLoc); static bool EvaluateBooleanTypeTrait(Sema &S, TypeTrait Kind, SourceLocation KWLoc, ArrayRef Args, SourceLocation RParenLoc, bool IsDependent) { if (IsDependent) return false; if (Kind <= UTT_Last) return EvaluateUnaryTypeTrait(S, Kind, KWLoc, Args[0]->getType()); // Evaluate BTT_ReferenceBindsToTemporary alongside the IsConstructible // traits to avoid duplication. if (Kind <= BTT_Last && Kind != BTT_ReferenceBindsToTemporary) return EvaluateBinaryTypeTrait(S, Kind, Args[0]->getType(), Args[1]->getType(), RParenLoc); switch (Kind) { case clang::BTT_ReferenceBindsToTemporary: case clang::TT_IsConstructible: case clang::TT_IsNothrowConstructible: case clang::TT_IsTriviallyConstructible: { // C++11 [meta.unary.prop]: // is_trivially_constructible is defined as: // // is_constructible::value is true and the variable // definition for is_constructible, as defined below, is known to call // no operation that is not trivial. // // The predicate condition for a template specialization // is_constructible shall be satisfied if and only if the // following variable definition would be well-formed for some invented // variable t: // // T t(create()...); assert(!Args.empty()); // Precondition: T and all types in the parameter pack Args shall be // complete types, (possibly cv-qualified) void, or arrays of // unknown bound. 
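    // Illustrative example (not from the original source; 'S' is hypothetical)
    // of the invented variable definition described above, using Clang's
    // builtins:
    //
    //   struct S { S(int, double); };
    //
    //   // Models "S t(create<int>(), create<double>());"
    //   static_assert(__is_constructible(S, int, double), "");
    //   static_assert(!__is_constructible(S, void *), "");
    //   static_assert(__is_constructible(int &&, int), "");
    //   static_assert(!__is_constructible(int &, int), "");  // an lvalue ref
    //                                                        // can't bind to
    //                                                        // a temporary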
    for (const auto *TSI : Args) {
      QualType ArgTy = TSI->getType();
      if (ArgTy->isVoidType() || ArgTy->isIncompleteArrayType())
        continue;

      if (S.RequireCompleteType(
              KWLoc, ArgTy, diag::err_incomplete_type_used_in_type_trait_expr))
        return false;
    }

    // Make sure the first argument is not incomplete nor a function type.
    QualType T = Args[0]->getType();
    if (T->isIncompleteType() || T->isFunctionType())
      return false;

    // Make sure the first argument is not an abstract type.
    CXXRecordDecl *RD = T->getAsCXXRecordDecl();
    if (RD && RD->isAbstract())
      return false;

    llvm::BumpPtrAllocator OpaqueExprAllocator;
    SmallVector<Expr *, 2> ArgExprs;
    ArgExprs.reserve(Args.size() - 1);
    for (unsigned I = 1, N = Args.size(); I != N; ++I) {
      QualType ArgTy = Args[I]->getType();
      if (ArgTy->isObjectType() || ArgTy->isFunctionType())
        ArgTy = S.Context.getRValueReferenceType(ArgTy);
      ArgExprs.push_back(
          new (OpaqueExprAllocator.Allocate<OpaqueValueExpr>())
              OpaqueValueExpr(Args[I]->getTypeLoc().getBeginLoc(),
                              ArgTy.getNonLValueExprType(S.Context),
                              Expr::getValueKindForType(ArgTy)));
    }

    // Perform the initialization in an unevaluated context within a SFINAE
    // trap at translation unit scope.
    EnterExpressionEvaluationContext Unevaluated(
        S, Sema::ExpressionEvaluationContext::Unevaluated);
    Sema::SFINAETrap SFINAE(S, /*AccessCheckingSFINAE=*/true);
    Sema::ContextRAII TUContext(S, S.Context.getTranslationUnitDecl());
    InitializedEntity To(
        InitializedEntity::InitializeTemporary(S.Context, Args[0]));
    InitializationKind InitKind(
        InitializationKind::CreateDirect(KWLoc, KWLoc, RParenLoc));

    InitializationSequence Init(S, To, InitKind, ArgExprs);
    if (Init.Failed())
      return false;

    ExprResult Result = Init.Perform(S, To, InitKind, ArgExprs);
    if (Result.isInvalid() || SFINAE.hasErrorOccurred())
      return false;

    if (Kind == clang::TT_IsConstructible)
      return true;

    if (Kind == clang::BTT_ReferenceBindsToTemporary) {
      if (!T->isReferenceType())
        return false;

      return !Init.isDirectReferenceBinding();
    }

    if (Kind == clang::TT_IsNothrowConstructible)
      return S.canThrow(Result.get()) == CT_Cannot;

    if (Kind == clang::TT_IsTriviallyConstructible) {
      // Under Objective-C ARC and Weak, if the destination has non-trivial
      // Objective-C lifetime, this is a non-trivial construction.
      if (T.getNonReferenceType().hasNonTrivialObjCLifetime())
        return false;

      // The initialization succeeded; now make sure there are no non-trivial
      // calls.
return !Result.get()->hasNonTrivialCall(S.Context); } llvm_unreachable("unhandled type trait"); return false; } default: llvm_unreachable("not a TT"); } return false; } namespace { void DiagnoseBuiltinDeprecation(Sema& S, TypeTrait Kind, SourceLocation KWLoc) { TypeTrait Replacement; switch (Kind) { case UTT_HasNothrowAssign: case UTT_HasNothrowMoveAssign: Replacement = BTT_IsNothrowAssignable; break; case UTT_HasNothrowCopy: case UTT_HasNothrowConstructor: Replacement = TT_IsNothrowConstructible; break; case UTT_HasTrivialAssign: case UTT_HasTrivialMoveAssign: Replacement = BTT_IsTriviallyAssignable; break; case UTT_HasTrivialCopy: Replacement = UTT_IsTriviallyCopyable; break; case UTT_HasTrivialDefaultConstructor: case UTT_HasTrivialMoveConstructor: Replacement = TT_IsTriviallyConstructible; break; case UTT_HasTrivialDestructor: Replacement = UTT_IsTriviallyDestructible; break; default: return; } S.Diag(KWLoc, diag::warn_deprecated_builtin) << getTraitSpelling(Kind) << getTraitSpelling(Replacement); } } bool Sema::CheckTypeTraitArity(unsigned Arity, SourceLocation Loc, size_t N) { if (Arity && N != Arity) { Diag(Loc, diag::err_type_trait_arity) << Arity << 0 << (Arity > 1) << (int)N << SourceRange(Loc); return false; } if (!Arity && N == 0) { Diag(Loc, diag::err_type_trait_arity) << 1 << 1 << 1 << (int)N << SourceRange(Loc); return false; } return true; } enum class TypeTraitReturnType { Bool, }; static TypeTraitReturnType GetReturnType(TypeTrait Kind) { return TypeTraitReturnType::Bool; } ExprResult Sema::BuildTypeTrait(TypeTrait Kind, SourceLocation KWLoc, ArrayRef Args, SourceLocation RParenLoc) { if (!CheckTypeTraitArity(getTypeTraitArity(Kind), KWLoc, Args.size())) return ExprError(); if (Kind <= UTT_Last && !CheckUnaryTypeTraitTypeCompleteness( *this, Kind, KWLoc, Args[0]->getType())) return ExprError(); DiagnoseBuiltinDeprecation(*this, Kind, KWLoc); bool Dependent = false; for (unsigned I = 0, N = Args.size(); I != N; ++I) { if (Args[I]->getType()->isDependentType()) { Dependent = true; break; } } switch (GetReturnType(Kind)) { case TypeTraitReturnType::Bool: { bool Result = EvaluateBooleanTypeTrait(*this, Kind, KWLoc, Args, RParenLoc, Dependent); return TypeTraitExpr::Create(Context, Context.getLogicalOperationType(), KWLoc, Kind, Args, RParenLoc, Result); } } llvm_unreachable("unhandled type trait return type"); } ExprResult Sema::ActOnTypeTrait(TypeTrait Kind, SourceLocation KWLoc, ArrayRef Args, SourceLocation RParenLoc) { SmallVector ConvertedArgs; ConvertedArgs.reserve(Args.size()); for (unsigned I = 0, N = Args.size(); I != N; ++I) { TypeSourceInfo *TInfo; QualType T = GetTypeFromParser(Args[I], &TInfo); if (!TInfo) TInfo = Context.getTrivialTypeSourceInfo(T, KWLoc); ConvertedArgs.push_back(TInfo); } return BuildTypeTrait(Kind, KWLoc, ConvertedArgs, RParenLoc); } static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT, QualType LhsT, QualType RhsT, SourceLocation KeyLoc) { assert(!LhsT->isDependentType() && !RhsT->isDependentType() && "Cannot evaluate traits of dependent types"); switch(BTT) { case BTT_IsBaseOf: { // C++0x [meta.rel]p2 // Base is a base class of Derived without regard to cv-qualifiers or // Base and Derived are not unions and name the same class type without // regard to cv-qualifiers. 
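    // Illustrative example (not from the original source; the types below are
    // hypothetical), using Clang's __is_base_of builtin:
    //
    //   struct B {};
    //   struct D : B {};
    //   union U {};
    //
    //   static_assert(__is_base_of(B, D), "");
    //   static_assert(!__is_base_of(D, B), "");
    //   static_assert(__is_base_of(const B, B), "");  // cv ignored, same class
    //   static_assert(!__is_base_of(U, U), "");       // unions are never bases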
const RecordType *lhsRecord = LhsT->getAs(); const RecordType *rhsRecord = RhsT->getAs(); if (!rhsRecord || !lhsRecord) { const ObjCObjectType *LHSObjTy = LhsT->getAs(); const ObjCObjectType *RHSObjTy = RhsT->getAs(); if (!LHSObjTy || !RHSObjTy) return false; ObjCInterfaceDecl *BaseInterface = LHSObjTy->getInterface(); ObjCInterfaceDecl *DerivedInterface = RHSObjTy->getInterface(); if (!BaseInterface || !DerivedInterface) return false; if (Self.RequireCompleteType( KeyLoc, RhsT, diag::err_incomplete_type_used_in_type_trait_expr)) return false; return BaseInterface->isSuperClassOf(DerivedInterface); } assert(Self.Context.hasSameUnqualifiedType(LhsT, RhsT) == (lhsRecord == rhsRecord)); // Unions are never base classes, and never have base classes. // It doesn't matter if they are complete or not. See PR#41843 if (lhsRecord && lhsRecord->getDecl()->isUnion()) return false; if (rhsRecord && rhsRecord->getDecl()->isUnion()) return false; if (lhsRecord == rhsRecord) return true; // C++0x [meta.rel]p2: // If Base and Derived are class types and are different types // (ignoring possible cv-qualifiers) then Derived shall be a // complete type. if (Self.RequireCompleteType(KeyLoc, RhsT, diag::err_incomplete_type_used_in_type_trait_expr)) return false; return cast(rhsRecord->getDecl()) ->isDerivedFrom(cast(lhsRecord->getDecl())); } case BTT_IsSame: return Self.Context.hasSameType(LhsT, RhsT); case BTT_TypeCompatible: { // GCC ignores cv-qualifiers on arrays for this builtin. Qualifiers LhsQuals, RhsQuals; QualType Lhs = Self.getASTContext().getUnqualifiedArrayType(LhsT, LhsQuals); QualType Rhs = Self.getASTContext().getUnqualifiedArrayType(RhsT, RhsQuals); return Self.Context.typesAreCompatible(Lhs, Rhs); } case BTT_IsConvertible: case BTT_IsConvertibleTo: { // C++0x [meta.rel]p4: // Given the following function prototype: // // template // typename add_rvalue_reference::type create(); // // the predicate condition for a template specialization // is_convertible shall be satisfied if and only if // the return expression in the following code would be // well-formed, including any implicit conversions to the return // type of the function: // // To test() { // return create(); // } // // Access checking is performed as if in a context unrelated to To and // From. Only the validity of the immediate context of the expression // of the return-statement (including conversions to the return type) // is considered. // // We model the initialization as a copy-initialization of a temporary // of the appropriate type, which for this expression is identical to the // return statement (since NRVO doesn't apply). // Functions aren't allowed to return function or array types. if (RhsT->isFunctionType() || RhsT->isArrayType()) return false; // A return statement in a void function must have void type. if (RhsT->isVoidType()) return LhsT->isVoidType(); // A function definition requires a complete, non-abstract return type. if (!Self.isCompleteType(KeyLoc, RhsT) || Self.isAbstractType(KeyLoc, RhsT)) return false; // Compute the result of add_rvalue_reference. if (LhsT->isObjectType() || LhsT->isFunctionType()) LhsT = Self.Context.getRValueReferenceType(LhsT); // Build a fake source and destination for initialization. 
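    // Illustrative example (not from the original source; the types below are
    // hypothetical) of the return-statement model quoted above, using Clang's
    // __is_convertible_to builtin:
    //
    //   struct B {};
    //   struct D : B {};
    //
    //   static_assert(__is_convertible_to(D *, B *), "");
    //   static_assert(!__is_convertible_to(B *, D *), "");
    //   static_assert(__is_convertible_to(int, double), "");
    //   static_assert(!__is_convertible_to(int, int[4]), "");  // functions
    //                                                          // can't return
    //                                                          // arrays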
InitializedEntity To(InitializedEntity::InitializeTemporary(RhsT)); OpaqueValueExpr From(KeyLoc, LhsT.getNonLValueExprType(Self.Context), Expr::getValueKindForType(LhsT)); Expr *FromPtr = &From; InitializationKind Kind(InitializationKind::CreateCopy(KeyLoc, SourceLocation())); // Perform the initialization in an unevaluated context within a SFINAE // trap at translation unit scope. EnterExpressionEvaluationContext Unevaluated( Self, Sema::ExpressionEvaluationContext::Unevaluated); Sema::SFINAETrap SFINAE(Self, /*AccessCheckingSFINAE=*/true); Sema::ContextRAII TUContext(Self, Self.Context.getTranslationUnitDecl()); InitializationSequence Init(Self, To, Kind, FromPtr); if (Init.Failed()) return false; ExprResult Result = Init.Perform(Self, To, Kind, FromPtr); return !Result.isInvalid() && !SFINAE.hasErrorOccurred(); } case BTT_IsAssignable: case BTT_IsNothrowAssignable: case BTT_IsTriviallyAssignable: { // C++11 [meta.unary.prop]p3: // is_trivially_assignable is defined as: // is_assignable::value is true and the assignment, as defined by // is_assignable, is known to call no operation that is not trivial // // is_assignable is defined as: // The expression declval() = declval() is well-formed when // treated as an unevaluated operand (Clause 5). // // For both, T and U shall be complete types, (possibly cv-qualified) // void, or arrays of unknown bound. if (!LhsT->isVoidType() && !LhsT->isIncompleteArrayType() && Self.RequireCompleteType(KeyLoc, LhsT, diag::err_incomplete_type_used_in_type_trait_expr)) return false; if (!RhsT->isVoidType() && !RhsT->isIncompleteArrayType() && Self.RequireCompleteType(KeyLoc, RhsT, diag::err_incomplete_type_used_in_type_trait_expr)) return false; // cv void is never assignable. if (LhsT->isVoidType() || RhsT->isVoidType()) return false; // Build expressions that emulate the effect of declval() and // declval(). if (LhsT->isObjectType() || LhsT->isFunctionType()) LhsT = Self.Context.getRValueReferenceType(LhsT); if (RhsT->isObjectType() || RhsT->isFunctionType()) RhsT = Self.Context.getRValueReferenceType(RhsT); OpaqueValueExpr Lhs(KeyLoc, LhsT.getNonLValueExprType(Self.Context), Expr::getValueKindForType(LhsT)); OpaqueValueExpr Rhs(KeyLoc, RhsT.getNonLValueExprType(Self.Context), Expr::getValueKindForType(RhsT)); // Attempt the assignment in an unevaluated context within a SFINAE // trap at translation unit scope. EnterExpressionEvaluationContext Unevaluated( Self, Sema::ExpressionEvaluationContext::Unevaluated); Sema::SFINAETrap SFINAE(Self, /*AccessCheckingSFINAE=*/true); Sema::ContextRAII TUContext(Self, Self.Context.getTranslationUnitDecl()); ExprResult Result = Self.BuildBinOp(/*S=*/nullptr, KeyLoc, BO_Assign, &Lhs, &Rhs); if (Result.isInvalid()) return false; // Treat the assignment as unused for the purpose of -Wdeprecated-volatile. Self.CheckUnusedVolatileAssignment(Result.get()); if (SFINAE.hasErrorOccurred()) return false; if (BTT == BTT_IsAssignable) return true; if (BTT == BTT_IsNothrowAssignable) return Self.canThrow(Result.get()) == CT_Cannot; if (BTT == BTT_IsTriviallyAssignable) { // Under Objective-C ARC and Weak, if the destination has non-trivial // Objective-C lifetime, this is a non-trivial assignment. 
if (LhsT.getNonReferenceType().hasNonTrivialObjCLifetime()) return false; return !Result.get()->hasNonTrivialCall(Self.Context); } llvm_unreachable("unhandled type trait"); return false; } default: llvm_unreachable("not a BTT"); } llvm_unreachable("Unknown type trait or not implemented"); } ExprResult Sema::ActOnArrayTypeTrait(ArrayTypeTrait ATT, SourceLocation KWLoc, ParsedType Ty, Expr* DimExpr, SourceLocation RParen) { TypeSourceInfo *TSInfo; QualType T = GetTypeFromParser(Ty, &TSInfo); if (!TSInfo) TSInfo = Context.getTrivialTypeSourceInfo(T); return BuildArrayTypeTrait(ATT, KWLoc, TSInfo, DimExpr, RParen); } static uint64_t EvaluateArrayTypeTrait(Sema &Self, ArrayTypeTrait ATT, QualType T, Expr *DimExpr, SourceLocation KeyLoc) { assert(!T->isDependentType() && "Cannot evaluate traits of dependent type"); switch(ATT) { case ATT_ArrayRank: if (T->isArrayType()) { unsigned Dim = 0; while (const ArrayType *AT = Self.Context.getAsArrayType(T)) { ++Dim; T = AT->getElementType(); } return Dim; } return 0; case ATT_ArrayExtent: { llvm::APSInt Value; uint64_t Dim; if (Self.VerifyIntegerConstantExpression( DimExpr, &Value, diag::err_dimension_expr_not_constant_integer) .isInvalid()) return 0; if (Value.isSigned() && Value.isNegative()) { Self.Diag(KeyLoc, diag::err_dimension_expr_not_constant_integer) << DimExpr->getSourceRange(); return 0; } Dim = Value.getLimitedValue(); if (T->isArrayType()) { unsigned D = 0; bool Matched = false; while (const ArrayType *AT = Self.Context.getAsArrayType(T)) { if (Dim == D) { Matched = true; break; } ++D; T = AT->getElementType(); } if (Matched && T->isArrayType()) { if (const ConstantArrayType *CAT = Self.Context.getAsConstantArrayType(T)) return CAT->getSize().getLimitedValue(); } } return 0; } } llvm_unreachable("Unknown type trait or not implemented"); } ExprResult Sema::BuildArrayTypeTrait(ArrayTypeTrait ATT, SourceLocation KWLoc, TypeSourceInfo *TSInfo, Expr* DimExpr, SourceLocation RParen) { QualType T = TSInfo->getType(); // FIXME: This should likely be tracked as an APInt to remove any host // assumptions about the width of size_t on the target. uint64_t Value = 0; if (!T->isDependentType()) Value = EvaluateArrayTypeTrait(*this, ATT, T, DimExpr, KWLoc); // While the specification for these traits from the Embarcadero C++ // compiler's documentation says the return type is 'unsigned int', Clang // returns 'size_t'. On Windows, the primary platform for the Embarcadero // compiler, there is no difference. On several other platforms this is an // important distinction. return new (Context) ArrayTypeTraitExpr(KWLoc, ATT, TSInfo, Value, DimExpr, RParen, Context.getSizeType()); } ExprResult Sema::ActOnExpressionTrait(ExpressionTrait ET, SourceLocation KWLoc, Expr *Queried, SourceLocation RParen) { // If error parsing the expression, ignore. if (!Queried) return ExprError(); ExprResult Result = BuildExpressionTrait(ET, KWLoc, Queried, RParen); return Result; } static bool EvaluateExpressionTrait(ExpressionTrait ET, Expr *E) { switch (ET) { case ET_IsLValueExpr: return E->isLValue(); case ET_IsRValueExpr: return E->isPRValue(); } llvm_unreachable("Expression trait not covered by switch"); } ExprResult Sema::BuildExpressionTrait(ExpressionTrait ET, SourceLocation KWLoc, Expr *Queried, SourceLocation RParen) { if (Queried->isTypeDependent()) { // Delay type-checking for type-dependent expressions. 
} else if (Queried->hasPlaceholderType()) { ExprResult PE = CheckPlaceholderExpr(Queried); if (PE.isInvalid()) return ExprError(); return BuildExpressionTrait(ET, KWLoc, PE.get(), RParen); } bool Value = EvaluateExpressionTrait(ET, Queried); return new (Context) ExpressionTraitExpr(KWLoc, ET, Queried, Value, RParen, Context.BoolTy); } QualType Sema::CheckPointerToMemberOperands(ExprResult &LHS, ExprResult &RHS, ExprValueKind &VK, SourceLocation Loc, bool isIndirect) { assert(!LHS.get()->hasPlaceholderType() && !RHS.get()->hasPlaceholderType() && "placeholders should have been weeded out by now"); // The LHS undergoes lvalue conversions if this is ->*, and undergoes the // temporary materialization conversion otherwise. if (isIndirect) LHS = DefaultLvalueConversion(LHS.get()); else if (LHS.get()->isPRValue()) LHS = TemporaryMaterializationConversion(LHS.get()); if (LHS.isInvalid()) return QualType(); // The RHS always undergoes lvalue conversions. RHS = DefaultLvalueConversion(RHS.get()); if (RHS.isInvalid()) return QualType(); const char *OpSpelling = isIndirect ? "->*" : ".*"; // C++ 5.5p2 // The binary operator .* [p3: ->*] binds its second operand, which shall // be of type "pointer to member of T" (where T is a completely-defined // class type) [...] QualType RHSType = RHS.get()->getType(); const MemberPointerType *MemPtr = RHSType->getAs(); if (!MemPtr) { Diag(Loc, diag::err_bad_memptr_rhs) << OpSpelling << RHSType << RHS.get()->getSourceRange(); return QualType(); } QualType Class(MemPtr->getClass(), 0); // Note: C++ [expr.mptr.oper]p2-3 says that the class type into which the // member pointer points must be completely-defined. However, there is no // reason for this semantic distinction, and the rule is not enforced by // other compilers. Therefore, we do not check this property, as it is // likely to be considered a defect. // C++ 5.5p2 // [...] to its first operand, which shall be of class T or of a class of // which T is an unambiguous and accessible base class. [p3: a pointer to // such a class] QualType LHSType = LHS.get()->getType(); if (isIndirect) { if (const PointerType *Ptr = LHSType->getAs()) LHSType = Ptr->getPointeeType(); else { Diag(Loc, diag::err_bad_memptr_lhs) << OpSpelling << 1 << LHSType << FixItHint::CreateReplacement(SourceRange(Loc), ".*"); return QualType(); } } if (!Context.hasSameUnqualifiedType(Class, LHSType)) { // If we want to check the hierarchy, we need a complete type. if (RequireCompleteType(Loc, LHSType, diag::err_bad_memptr_lhs, OpSpelling, (int)isIndirect)) { return QualType(); } if (!IsDerivedFrom(Loc, LHSType, Class)) { Diag(Loc, diag::err_bad_memptr_lhs) << OpSpelling << (int)isIndirect << LHS.get()->getType(); return QualType(); } CXXCastPath BasePath; if (CheckDerivedToBaseConversion( LHSType, Class, Loc, SourceRange(LHS.get()->getBeginLoc(), RHS.get()->getEndLoc()), &BasePath)) return QualType(); // Cast LHS to type of use. QualType UseType = Context.getQualifiedType(Class, LHSType.getQualifiers()); if (isIndirect) UseType = Context.getPointerType(UseType); ExprValueKind VK = isIndirect ? VK_PRValue : LHS.get()->getValueKind(); LHS = ImpCastExprToType(LHS.get(), UseType, CK_DerivedToBase, VK, &BasePath); } if (isa(RHS.get()->IgnoreParens())) { // Diagnose use of pointer-to-member type which when used as // the functional cast in a pointer-to-member expression. Diag(Loc, diag::err_pointer_to_member_type) << isIndirect; return QualType(); } // C++ 5.5p2 // The result is an object or a function of the type specified by the // second operand. 
// The cv qualifiers are the union of those in the pointer and the left side, // in accordance with 5.5p5 and 5.2.5. QualType Result = MemPtr->getPointeeType(); Result = Context.getCVRQualifiedType(Result, LHSType.getCVRQualifiers()); // C++0x [expr.mptr.oper]p6: // In a .* expression whose object expression is an rvalue, the program is // ill-formed if the second operand is a pointer to member function with // ref-qualifier &. In a ->* expression or in a .* expression whose object // expression is an lvalue, the program is ill-formed if the second operand // is a pointer to member function with ref-qualifier &&. if (const FunctionProtoType *Proto = Result->getAs()) { switch (Proto->getRefQualifier()) { case RQ_None: // Do nothing break; case RQ_LValue: if (!isIndirect && !LHS.get()->Classify(Context).isLValue()) { // C++2a allows functions with ref-qualifier & if their cv-qualifier-seq // is (exactly) 'const'. if (Proto->isConst() && !Proto->isVolatile()) Diag(Loc, getLangOpts().CPlusPlus20 ? diag::warn_cxx17_compat_pointer_to_const_ref_member_on_rvalue : diag::ext_pointer_to_const_ref_member_on_rvalue); else Diag(Loc, diag::err_pointer_to_member_oper_value_classify) << RHSType << 1 << LHS.get()->getSourceRange(); } break; case RQ_RValue: if (isIndirect || !LHS.get()->Classify(Context).isRValue()) Diag(Loc, diag::err_pointer_to_member_oper_value_classify) << RHSType << 0 << LHS.get()->getSourceRange(); break; } } // C++ [expr.mptr.oper]p6: // The result of a .* expression whose second operand is a pointer // to a data member is of the same value category as its // first operand. The result of a .* expression whose second // operand is a pointer to a member function is a prvalue. The // result of an ->* expression is an lvalue if its second operand // is a pointer to data member and a prvalue otherwise. if (Result->isFunctionType()) { VK = VK_PRValue; return Context.BoundMemberTy; } else if (isIndirect) { VK = VK_LValue; } else { VK = LHS.get()->getValueKind(); } return Result; } /// Try to convert a type to another according to C++11 5.16p3. /// /// This is part of the parameter validation for the ? operator. If either /// value operand is a class type, the two operands are attempted to be /// converted to each other. This function does the conversion in one direction. /// It returns true if the program is ill-formed and has already been diagnosed /// as such. static bool TryClassUnification(Sema &Self, Expr *From, Expr *To, SourceLocation QuestionLoc, bool &HaveConversion, QualType &ToType) { HaveConversion = false; ToType = To->getType(); InitializationKind Kind = InitializationKind::CreateCopy(To->getBeginLoc(), SourceLocation()); // C++11 5.16p3 // The process for determining whether an operand expression E1 of type T1 // can be converted to match an operand expression E2 of type T2 is defined // as follows: // -- If E2 is an lvalue: E1 can be converted to match E2 if E1 can be // implicitly converted to type "lvalue reference to T2", subject to the // constraint that in the conversion the reference must bind directly to // an lvalue. // -- If E2 is an xvalue: E1 can be converted to match E2 if E1 can be // implicitly converted to the type "rvalue reference to R2", subject to // the constraint that the reference must bind directly. 
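  // Illustrative example (not from the original source; 'B' and 'D' are
  // hypothetical): with
  //
  //   struct B {};
  //   struct D : B {};
  //   B b; D d;
  //
  // the D lvalue in "cond ? b : d" is converted to match the B lvalue, since
  // an lvalue reference to B binds directly to 'd'; the reverse direction is
  // not possible, so the result is an lvalue of type B:
  //
  //   static_assert(__is_same(decltype(true ? b : d), B &), "");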
if (To->isGLValue()) { QualType T = Self.Context.getReferenceQualifiedType(To); InitializedEntity Entity = InitializedEntity::InitializeTemporary(T); InitializationSequence InitSeq(Self, Entity, Kind, From); if (InitSeq.isDirectReferenceBinding()) { ToType = T; HaveConversion = true; return false; } if (InitSeq.isAmbiguous()) return InitSeq.Diagnose(Self, Entity, Kind, From); } // -- If E2 is an rvalue, or if the conversion above cannot be done: // -- if E1 and E2 have class type, and the underlying class types are // the same or one is a base class of the other: QualType FTy = From->getType(); QualType TTy = To->getType(); const RecordType *FRec = FTy->getAs(); const RecordType *TRec = TTy->getAs(); bool FDerivedFromT = FRec && TRec && FRec != TRec && Self.IsDerivedFrom(QuestionLoc, FTy, TTy); if (FRec && TRec && (FRec == TRec || FDerivedFromT || Self.IsDerivedFrom(QuestionLoc, TTy, FTy))) { // E1 can be converted to match E2 if the class of T2 is the // same type as, or a base class of, the class of T1, and // [cv2 > cv1]. if (FRec == TRec || FDerivedFromT) { if (TTy.isAtLeastAsQualifiedAs(FTy)) { InitializedEntity Entity = InitializedEntity::InitializeTemporary(TTy); InitializationSequence InitSeq(Self, Entity, Kind, From); if (InitSeq) { HaveConversion = true; return false; } if (InitSeq.isAmbiguous()) return InitSeq.Diagnose(Self, Entity, Kind, From); } } return false; } // -- Otherwise: E1 can be converted to match E2 if E1 can be // implicitly converted to the type that expression E2 would have // if E2 were converted to an rvalue (or the type it has, if E2 is // an rvalue). // // This actually refers very narrowly to the lvalue-to-rvalue conversion, not // to the array-to-pointer or function-to-pointer conversions. TTy = TTy.getNonLValueExprType(Self.Context); InitializedEntity Entity = InitializedEntity::InitializeTemporary(TTy); InitializationSequence InitSeq(Self, Entity, Kind, From); HaveConversion = !InitSeq.Failed(); ToType = TTy; if (InitSeq.isAmbiguous()) return InitSeq.Diagnose(Self, Entity, Kind, From); return false; } /// Try to find a common type for two according to C++0x 5.16p5. /// /// This is part of the parameter validation for the ? operator. If either /// value operand is a class type, overload resolution is used to find a /// conversion to a common type. static bool FindConditionalOverload(Sema &Self, ExprResult &LHS, ExprResult &RHS, SourceLocation QuestionLoc) { Expr *Args[2] = { LHS.get(), RHS.get() }; OverloadCandidateSet CandidateSet(QuestionLoc, OverloadCandidateSet::CSK_Operator); Self.AddBuiltinOperatorCandidates(OO_Conditional, QuestionLoc, Args, CandidateSet); OverloadCandidateSet::iterator Best; switch (CandidateSet.BestViableFunction(Self, QuestionLoc, Best)) { case OR_Success: { // We found a match. Perform the conversions on the arguments and move on. ExprResult LHSRes = Self.PerformImplicitConversion( LHS.get(), Best->BuiltinParamTypes[0], Best->Conversions[0], Sema::AA_Converting); if (LHSRes.isInvalid()) break; LHS = LHSRes; ExprResult RHSRes = Self.PerformImplicitConversion( RHS.get(), Best->BuiltinParamTypes[1], Best->Conversions[1], Sema::AA_Converting); if (RHSRes.isInvalid()) break; RHS = RHSRes; if (Best->Function) Self.MarkFunctionReferenced(QuestionLoc, Best->Function); return false; } case OR_No_Viable_Function: // Emit a better diagnostic if one of the expressions is a null pointer // constant and the other is a pointer type. In this case, the user most // likely forgot to take the address of the other expression. 
if (Self.DiagnoseConditionalForNull(LHS.get(), RHS.get(), QuestionLoc)) return true; Self.Diag(QuestionLoc, diag::err_typecheck_cond_incompatible_operands) << LHS.get()->getType() << RHS.get()->getType() << LHS.get()->getSourceRange() << RHS.get()->getSourceRange(); return true; case OR_Ambiguous: Self.Diag(QuestionLoc, diag::err_conditional_ambiguous_ovl) << LHS.get()->getType() << RHS.get()->getType() << LHS.get()->getSourceRange() << RHS.get()->getSourceRange(); // FIXME: Print the possible common types by printing the return types of // the viable candidates. break; case OR_Deleted: llvm_unreachable("Conditional operator has only built-in overloads"); } return true; } /// Perform an "extended" implicit conversion as returned by /// TryClassUnification. static bool ConvertForConditional(Sema &Self, ExprResult &E, QualType T) { InitializedEntity Entity = InitializedEntity::InitializeTemporary(T); InitializationKind Kind = InitializationKind::CreateCopy(E.get()->getBeginLoc(), SourceLocation()); Expr *Arg = E.get(); InitializationSequence InitSeq(Self, Entity, Kind, Arg); ExprResult Result = InitSeq.Perform(Self, Entity, Kind, Arg); if (Result.isInvalid()) return true; E = Result; return false; } // Check the condition operand of ?: to see if it is valid for the GCC // extension. static bool isValidVectorForConditionalCondition(ASTContext &Ctx, QualType CondTy) { if (!CondTy->isVectorType() && !CondTy->isExtVectorType()) return false; const QualType EltTy = cast(CondTy.getCanonicalType())->getElementType(); assert(!EltTy->isEnumeralType() && "Vectors cant be enum types"); return EltTy->isIntegralType(Ctx); } static bool isValidSizelessVectorForConditionalCondition(ASTContext &Ctx, QualType CondTy) { if (!CondTy->isVLSTBuiltinType()) return false; const QualType EltTy = cast(CondTy.getCanonicalType())->getSveEltType(Ctx); assert(!EltTy->isEnumeralType() && "Vectors cant be enum types"); return EltTy->isIntegralType(Ctx); } QualType Sema::CheckVectorConditionalTypes(ExprResult &Cond, ExprResult &LHS, ExprResult &RHS, SourceLocation QuestionLoc) { LHS = DefaultFunctionArrayLvalueConversion(LHS.get()); RHS = DefaultFunctionArrayLvalueConversion(RHS.get()); QualType CondType = Cond.get()->getType(); const auto *CondVT = CondType->castAs(); QualType CondElementTy = CondVT->getElementType(); unsigned CondElementCount = CondVT->getNumElements(); QualType LHSType = LHS.get()->getType(); const auto *LHSVT = LHSType->getAs(); QualType RHSType = RHS.get()->getType(); const auto *RHSVT = RHSType->getAs(); QualType ResultType; if (LHSVT && RHSVT) { if (isa(CondVT) != isa(LHSVT)) { Diag(QuestionLoc, diag::err_conditional_vector_cond_result_mismatch) << /*isExtVector*/ isa(CondVT); return {}; } // If both are vector types, they must be the same type. if (!Context.hasSameType(LHSType, RHSType)) { Diag(QuestionLoc, diag::err_conditional_vector_mismatched) << LHSType << RHSType; return {}; } ResultType = Context.getCommonSugaredType(LHSType, RHSType); } else if (LHSVT || RHSVT) { ResultType = CheckVectorOperands( LHS, RHS, QuestionLoc, /*isCompAssign*/ false, /*AllowBothBool*/ true, /*AllowBoolConversions*/ false, /*AllowBoolOperation*/ true, /*ReportInvalid*/ true); if (ResultType.isNull()) return {}; } else { // Both are scalar. LHSType = LHSType.getUnqualifiedType(); RHSType = RHSType.getUnqualifiedType(); QualType ResultElementTy = Context.hasSameType(LHSType, RHSType) ? 
Context.getCommonSugaredType(LHSType, RHSType) : UsualArithmeticConversions(LHS, RHS, QuestionLoc, ACK_Conditional); if (ResultElementTy->isEnumeralType()) { Diag(QuestionLoc, diag::err_conditional_vector_operand_type) << ResultElementTy; return {}; } if (CondType->isExtVectorType()) ResultType = Context.getExtVectorType(ResultElementTy, CondVT->getNumElements()); else ResultType = Context.getVectorType( ResultElementTy, CondVT->getNumElements(), VectorType::GenericVector); LHS = ImpCastExprToType(LHS.get(), ResultType, CK_VectorSplat); RHS = ImpCastExprToType(RHS.get(), ResultType, CK_VectorSplat); } assert(!ResultType.isNull() && ResultType->isVectorType() && (!CondType->isExtVectorType() || ResultType->isExtVectorType()) && "Result should have been a vector type"); auto *ResultVectorTy = ResultType->castAs(); QualType ResultElementTy = ResultVectorTy->getElementType(); unsigned ResultElementCount = ResultVectorTy->getNumElements(); if (ResultElementCount != CondElementCount) { Diag(QuestionLoc, diag::err_conditional_vector_size) << CondType << ResultType; return {}; } if (Context.getTypeSize(ResultElementTy) != Context.getTypeSize(CondElementTy)) { Diag(QuestionLoc, diag::err_conditional_vector_element_size) << CondType << ResultType; return {}; } return ResultType; } QualType Sema::CheckSizelessVectorConditionalTypes(ExprResult &Cond, ExprResult &LHS, ExprResult &RHS, SourceLocation QuestionLoc) { LHS = DefaultFunctionArrayLvalueConversion(LHS.get()); RHS = DefaultFunctionArrayLvalueConversion(RHS.get()); QualType CondType = Cond.get()->getType(); const auto *CondBT = CondType->castAs(); QualType CondElementTy = CondBT->getSveEltType(Context); llvm::ElementCount CondElementCount = Context.getBuiltinVectorTypeInfo(CondBT).EC; QualType LHSType = LHS.get()->getType(); const auto *LHSBT = LHSType->isVLSTBuiltinType() ? LHSType->getAs() : nullptr; QualType RHSType = RHS.get()->getType(); const auto *RHSBT = RHSType->isVLSTBuiltinType() ? RHSType->getAs() : nullptr; QualType ResultType; if (LHSBT && RHSBT) { // If both are sizeless vector types, they must be the same type. 
if (!Context.hasSameType(LHSType, RHSType)) { Diag(QuestionLoc, diag::err_conditional_vector_mismatched) << LHSType << RHSType; return QualType(); } ResultType = LHSType; } else if (LHSBT || RHSBT) { ResultType = CheckSizelessVectorOperands( LHS, RHS, QuestionLoc, /*IsCompAssign*/ false, ACK_Conditional); if (ResultType.isNull()) return QualType(); } else { // Both are scalar so splat QualType ResultElementTy; LHSType = LHSType.getCanonicalType().getUnqualifiedType(); RHSType = RHSType.getCanonicalType().getUnqualifiedType(); if (Context.hasSameType(LHSType, RHSType)) ResultElementTy = LHSType; else ResultElementTy = UsualArithmeticConversions(LHS, RHS, QuestionLoc, ACK_Conditional); if (ResultElementTy->isEnumeralType()) { Diag(QuestionLoc, diag::err_conditional_vector_operand_type) << ResultElementTy; return QualType(); } ResultType = Context.getScalableVectorType( ResultElementTy, CondElementCount.getKnownMinValue()); LHS = ImpCastExprToType(LHS.get(), ResultType, CK_VectorSplat); RHS = ImpCastExprToType(RHS.get(), ResultType, CK_VectorSplat); } assert(!ResultType.isNull() && ResultType->isVLSTBuiltinType() && "Result should have been a vector type"); auto *ResultBuiltinTy = ResultType->castAs(); QualType ResultElementTy = ResultBuiltinTy->getSveEltType(Context); llvm::ElementCount ResultElementCount = Context.getBuiltinVectorTypeInfo(ResultBuiltinTy).EC; if (ResultElementCount != CondElementCount) { Diag(QuestionLoc, diag::err_conditional_vector_size) << CondType << ResultType; return QualType(); } if (Context.getTypeSize(ResultElementTy) != Context.getTypeSize(CondElementTy)) { Diag(QuestionLoc, diag::err_conditional_vector_element_size) << CondType << ResultType; return QualType(); } return ResultType; } /// Check the operands of ?: under C++ semantics. /// /// See C++ [expr.cond]. Note that LHS is never null, even for the GNU x ?: y /// extension. In this case, LHS == Cond. (But they're not aliases.) /// /// This function also implements GCC's vector extension and the /// OpenCL/ext_vector_type extension for conditionals. The vector extensions /// permit the use of a?b:c where the type of a is that of a integer vector with /// the same number of elements and size as the vectors of b and c. If one of /// either b or c is a scalar it is implicitly converted to match the type of /// the vector. Otherwise the expression is ill-formed. If both b and c are /// scalars, then b and c are checked and converted to the type of a if /// possible. /// /// The expressions are evaluated differently for GCC's and OpenCL's extensions. /// For the GCC extension, the ?: operator is evaluated as /// (a[0] != 0 ? b[0] : c[0], .. , a[n] != 0 ? b[n] : c[n]). /// For the OpenCL extensions, the ?: operator is evaluated as /// (most-significant-bit-set(a[0]) ? b[0] : c[0], .. , /// most-significant-bit-set(a[n]) ? b[n] : c[n]). QualType Sema::CXXCheckConditionalOperands(ExprResult &Cond, ExprResult &LHS, ExprResult &RHS, ExprValueKind &VK, ExprObjectKind &OK, SourceLocation QuestionLoc) { // FIXME: Handle C99's complex types, block pointers and Obj-C++ interface // pointers. // Assume r-value. VK = VK_PRValue; OK = OK_Ordinary; bool IsVectorConditional = isValidVectorForConditionalCondition(Context, Cond.get()->getType()); bool IsSizelessVectorConditional = isValidSizelessVectorForConditionalCondition(Context, Cond.get()->getType()); // C++11 [expr.cond]p1 // The first expression is contextually converted to bool. 
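  // Illustrative example (not from the original source; 'v4i' is hypothetical)
  // of the GCC vector extension described in the comment above, using Clang's
  // vector_size attribute:
  //
  //   typedef int v4i __attribute__((vector_size(16)));
  //   v4i a = {1, 0, 2, 0};
  //   v4i b = {10, 20, 30, 40};
  //   v4i c = {50, 60, 70, 80};
  //   v4i r = a ? b : c;  // element-wise: {10, 60, 30, 80}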
if (!Cond.get()->isTypeDependent()) { ExprResult CondRes = IsVectorConditional || IsSizelessVectorConditional ? DefaultFunctionArrayLvalueConversion(Cond.get()) : CheckCXXBooleanCondition(Cond.get()); if (CondRes.isInvalid()) return QualType(); Cond = CondRes; } else { // To implement C++, the first expression typically doesn't alter the result // type of the conditional, however the GCC compatible vector extension // changes the result type to be that of the conditional. Since we cannot // know if this is a vector extension here, delay the conversion of the // LHS/RHS below until later. return Context.DependentTy; } // Either of the arguments dependent? if (LHS.get()->isTypeDependent() || RHS.get()->isTypeDependent()) return Context.DependentTy; // C++11 [expr.cond]p2 // If either the second or the third operand has type (cv) void, ... QualType LTy = LHS.get()->getType(); QualType RTy = RHS.get()->getType(); bool LVoid = LTy->isVoidType(); bool RVoid = RTy->isVoidType(); if (LVoid || RVoid) { // ... one of the following shall hold: // -- The second or the third operand (but not both) is a (possibly // parenthesized) throw-expression; the result is of the type // and value category of the other. bool LThrow = isa(LHS.get()->IgnoreParenImpCasts()); bool RThrow = isa(RHS.get()->IgnoreParenImpCasts()); // Void expressions aren't legal in the vector-conditional expressions. if (IsVectorConditional) { SourceRange DiagLoc = LVoid ? LHS.get()->getSourceRange() : RHS.get()->getSourceRange(); bool IsThrow = LVoid ? LThrow : RThrow; Diag(DiagLoc.getBegin(), diag::err_conditional_vector_has_void) << DiagLoc << IsThrow; return QualType(); } if (LThrow != RThrow) { Expr *NonThrow = LThrow ? RHS.get() : LHS.get(); VK = NonThrow->getValueKind(); // DR (no number yet): the result is a bit-field if the // non-throw-expression operand is a bit-field. OK = NonThrow->getObjectKind(); return NonThrow->getType(); } // -- Both the second and third operands have type void; the result is of // type void and is a prvalue. if (LVoid && RVoid) return Context.getCommonSugaredType(LTy, RTy); // Neither holds, error. Diag(QuestionLoc, diag::err_conditional_void_nonvoid) << (LVoid ? RTy : LTy) << (LVoid ? 0 : 1) << LHS.get()->getSourceRange() << RHS.get()->getSourceRange(); return QualType(); } // Neither is void. if (IsVectorConditional) return CheckVectorConditionalTypes(Cond, LHS, RHS, QuestionLoc); if (IsSizelessVectorConditional) return CheckSizelessVectorConditionalTypes(Cond, LHS, RHS, QuestionLoc); // WebAssembly tables are not allowed as conditional LHS or RHS. if (LTy->isWebAssemblyTableType() || RTy->isWebAssemblyTableType()) { Diag(QuestionLoc, diag::err_wasm_table_conditional_expression) << LHS.get()->getSourceRange() << RHS.get()->getSourceRange(); return QualType(); } // C++11 [expr.cond]p3 // Otherwise, if the second and third operand have different types, and // either has (cv) class type [...] an attempt is made to convert each of // those operands to the type of the other. if (!Context.hasSameType(LTy, RTy) && (LTy->isRecordType() || RTy->isRecordType())) { // These return true if a single direction is already ambiguous. QualType L2RType, R2LType; bool HaveL2R, HaveR2L; if (TryClassUnification(*this, LHS.get(), RHS.get(), QuestionLoc, HaveL2R, L2RType)) return QualType(); if (TryClassUnification(*this, RHS.get(), LHS.get(), QuestionLoc, HaveR2L, R2LType)) return QualType(); // If both can be converted, [...] the program is ill-formed. 
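    // Illustrative example (not from the original source; 'A' and 'B' are
    // hypothetical): with
    //
    //   struct A { A(); A(const struct B &); };
    //   struct B { B(); B(const A &); };
    //   A a; B b;
    //
    // each operand of "cond ? a : b" can be converted to the other operand's
    // type, so the conditional is ambiguous and ill-formed (diagnosed just
    // below).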
if (HaveL2R && HaveR2L) { Diag(QuestionLoc, diag::err_conditional_ambiguous) << LTy << RTy << LHS.get()->getSourceRange() << RHS.get()->getSourceRange(); return QualType(); } // If exactly one conversion is possible, that conversion is applied to // the chosen operand and the converted operands are used in place of the // original operands for the remainder of this section. if (HaveL2R) { if (ConvertForConditional(*this, LHS, L2RType) || LHS.isInvalid()) return QualType(); LTy = LHS.get()->getType(); } else if (HaveR2L) { if (ConvertForConditional(*this, RHS, R2LType) || RHS.isInvalid()) return QualType(); RTy = RHS.get()->getType(); } } // C++11 [expr.cond]p3 // if both are glvalues of the same value category and the same type except // for cv-qualification, an attempt is made to convert each of those // operands to the type of the other. // FIXME: // Resolving a defect in P0012R1: we extend this to cover all cases where // one of the operands is reference-compatible with the other, in order // to support conditionals between functions differing in noexcept. This // will similarly cover difference in array bounds after P0388R4. // FIXME: If LTy and RTy have a composite pointer type, should we convert to // that instead? ExprValueKind LVK = LHS.get()->getValueKind(); ExprValueKind RVK = RHS.get()->getValueKind(); if (!Context.hasSameType(LTy, RTy) && LVK == RVK && LVK != VK_PRValue) { // DerivedToBase was already handled by the class-specific case above. // FIXME: Should we allow ObjC conversions here? const ReferenceConversions AllowedConversions = ReferenceConversions::Qualification | ReferenceConversions::NestedQualification | ReferenceConversions::Function; ReferenceConversions RefConv; if (CompareReferenceRelationship(QuestionLoc, LTy, RTy, &RefConv) == Ref_Compatible && !(RefConv & ~AllowedConversions) && // [...] subject to the constraint that the reference must bind // directly [...] !RHS.get()->refersToBitField() && !RHS.get()->refersToVectorElement()) { RHS = ImpCastExprToType(RHS.get(), LTy, CK_NoOp, RVK); RTy = RHS.get()->getType(); } else if (CompareReferenceRelationship(QuestionLoc, RTy, LTy, &RefConv) == Ref_Compatible && !(RefConv & ~AllowedConversions) && !LHS.get()->refersToBitField() && !LHS.get()->refersToVectorElement()) { LHS = ImpCastExprToType(LHS.get(), RTy, CK_NoOp, LVK); LTy = LHS.get()->getType(); } } // C++11 [expr.cond]p4 // If the second and third operands are glvalues of the same value // category and have the same type, the result is of that type and // value category and it is a bit-field if the second or the third // operand is a bit-field, or if both are bit-fields. // We only extend this to bitfields, not to the crazy other kinds of // l-values. bool Same = Context.hasSameType(LTy, RTy); if (Same && LVK == RVK && LVK != VK_PRValue && LHS.get()->isOrdinaryOrBitFieldObject() && RHS.get()->isOrdinaryOrBitFieldObject()) { VK = LHS.get()->getValueKind(); if (LHS.get()->getObjectKind() == OK_BitField || RHS.get()->getObjectKind() == OK_BitField) OK = OK_BitField; return Context.getCommonSugaredType(LTy, RTy); } // C++11 [expr.cond]p5 // Otherwise, the result is a prvalue. If the second and third operands // do not have the same type, and either has (cv) class type, ... if (!Same && (LTy->isRecordType() || RTy->isRecordType())) { // ... overload resolution is used to determine the conversions (if any) // to be applied to the operands. If the overload resolution fails, the // program is ill-formed. 
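    // Illustrative example (not from the original source; 'A' and 'B' are
    // hypothetical): with
    //
    //   struct A { operator int() const; };
    //   struct B { operator int() const; };
    //   A a; B b;
    //
    // neither operand of "cond ? a : b" converts to the other's class type, so
    // overload resolution over the built-in conditional candidates converts
    // both operands to int and the result is a prvalue of type int.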
if (FindConditionalOverload(*this, LHS, RHS, QuestionLoc)) return QualType(); } // C++11 [expr.cond]p6 // Lvalue-to-rvalue, array-to-pointer, and function-to-pointer standard // conversions are performed on the second and third operands. LHS = DefaultFunctionArrayLvalueConversion(LHS.get()); RHS = DefaultFunctionArrayLvalueConversion(RHS.get()); if (LHS.isInvalid() || RHS.isInvalid()) return QualType(); LTy = LHS.get()->getType(); RTy = RHS.get()->getType(); // After those conversions, one of the following shall hold: // -- The second and third operands have the same type; the result // is of that type. If the operands have class type, the result // is a prvalue temporary of the result type, which is // copy-initialized from either the second operand or the third // operand depending on the value of the first operand. if (Context.hasSameType(LTy, RTy)) { if (LTy->isRecordType()) { // The operands have class type. Make a temporary copy. ExprResult LHSCopy = PerformCopyInitialization( InitializedEntity::InitializeTemporary(LTy), SourceLocation(), LHS); if (LHSCopy.isInvalid()) return QualType(); ExprResult RHSCopy = PerformCopyInitialization( InitializedEntity::InitializeTemporary(RTy), SourceLocation(), RHS); if (RHSCopy.isInvalid()) return QualType(); LHS = LHSCopy; RHS = RHSCopy; } return Context.getCommonSugaredType(LTy, RTy); } // Extension: conditional operator involving vector types. if (LTy->isVectorType() || RTy->isVectorType()) return CheckVectorOperands(LHS, RHS, QuestionLoc, /*isCompAssign*/ false, /*AllowBothBool*/ true, /*AllowBoolConversions*/ false, /*AllowBoolOperation*/ false, /*ReportInvalid*/ true); // -- The second and third operands have arithmetic or enumeration type; // the usual arithmetic conversions are performed to bring them to a // common type, and the result is of that type. if (LTy->isArithmeticType() && RTy->isArithmeticType()) { QualType ResTy = UsualArithmeticConversions(LHS, RHS, QuestionLoc, ACK_Conditional); if (LHS.isInvalid() || RHS.isInvalid()) return QualType(); if (ResTy.isNull()) { Diag(QuestionLoc, diag::err_typecheck_cond_incompatible_operands) << LTy << RTy << LHS.get()->getSourceRange() << RHS.get()->getSourceRange(); return QualType(); } LHS = ImpCastExprToType(LHS.get(), ResTy, PrepareScalarCast(LHS, ResTy)); RHS = ImpCastExprToType(RHS.get(), ResTy, PrepareScalarCast(RHS, ResTy)); return ResTy; } // -- The second and third operands have pointer type, or one has pointer // type and the other is a null pointer constant, or both are null // pointer constants, at least one of which is non-integral; pointer // conversions and qualification conversions are performed to bring them // to their composite pointer type. The result is of the composite // pointer type. // -- The second and third operands have pointer to member type, or one has // pointer to member type and the other is a null pointer constant; // pointer to member conversions and qualification conversions are // performed to bring them to a common type, whose cv-qualification // shall match the cv-qualification of either the second or the third // operand. The result is of the common type. QualType Composite = FindCompositePointerType(QuestionLoc, LHS, RHS); if (!Composite.isNull()) return Composite; // Similarly, attempt to find composite type of two objective-c pointers. 
Composite = FindCompositeObjCPointerType(LHS, RHS, QuestionLoc); if (LHS.isInvalid() || RHS.isInvalid()) return QualType(); if (!Composite.isNull()) return Composite; // Check if we are using a null with a non-pointer type. if (DiagnoseConditionalForNull(LHS.get(), RHS.get(), QuestionLoc)) return QualType(); Diag(QuestionLoc, diag::err_typecheck_cond_incompatible_operands) << LHS.get()->getType() << RHS.get()->getType() << LHS.get()->getSourceRange() << RHS.get()->getSourceRange(); return QualType(); } /// Find a merged pointer type and convert the two expressions to it. /// /// This finds the composite pointer type for \p E1 and \p E2 according to /// C++2a [expr.type]p3. It converts both expressions to this type and returns /// it. It does not emit diagnostics (FIXME: that's not true if \p ConvertArgs /// is \c true). /// /// \param Loc The location of the operator requiring these two expressions to /// be converted to the composite pointer type. /// /// \param ConvertArgs If \c false, do not convert E1 and E2 to the target type. QualType Sema::FindCompositePointerType(SourceLocation Loc, Expr *&E1, Expr *&E2, bool ConvertArgs) { assert(getLangOpts().CPlusPlus && "This function assumes C++"); // C++1z [expr]p14: // The composite pointer type of two operands p1 and p2 having types T1 // and T2 QualType T1 = E1->getType(), T2 = E2->getType(); // where at least one is a pointer or pointer to member type or // std::nullptr_t is: bool T1IsPointerLike = T1->isAnyPointerType() || T1->isMemberPointerType() || T1->isNullPtrType(); bool T2IsPointerLike = T2->isAnyPointerType() || T2->isMemberPointerType() || T2->isNullPtrType(); if (!T1IsPointerLike && !T2IsPointerLike) return QualType(); // - if both p1 and p2 are null pointer constants, std::nullptr_t; // This can't actually happen, following the standard, but we also use this // to implement the end of [expr.conv], which hits this case. // // - if either p1 or p2 is a null pointer constant, T2 or T1, respectively; if (T1IsPointerLike && E2->isNullPointerConstant(Context, Expr::NPC_ValueDependentIsNull)) { if (ConvertArgs) E2 = ImpCastExprToType(E2, T1, T1->isMemberPointerType() ? CK_NullToMemberPointer : CK_NullToPointer).get(); return T1; } if (T2IsPointerLike && E1->isNullPointerConstant(Context, Expr::NPC_ValueDependentIsNull)) { if (ConvertArgs) E1 = ImpCastExprToType(E1, T2, T2->isMemberPointerType() ? CK_NullToMemberPointer : CK_NullToPointer).get(); return T2; } // Now both have to be pointers or member pointers. if (!T1IsPointerLike || !T2IsPointerLike) return QualType(); assert(!T1->isNullPtrType() && !T2->isNullPtrType() && "nullptr_t should be a null pointer constant"); struct Step { enum Kind { Pointer, ObjCPointer, MemberPointer, Array } K; // Qualifiers to apply under the step kind. Qualifiers Quals; /// The class for a pointer-to-member; a constant array type with a bound /// (if any) for an array. 
const Type *ClassOrBound; Step(Kind K, const Type *ClassOrBound = nullptr) : K(K), ClassOrBound(ClassOrBound) {} QualType rebuild(ASTContext &Ctx, QualType T) const { T = Ctx.getQualifiedType(T, Quals); switch (K) { case Pointer: return Ctx.getPointerType(T); case MemberPointer: return Ctx.getMemberPointerType(T, ClassOrBound); case ObjCPointer: return Ctx.getObjCObjectPointerType(T); case Array: if (auto *CAT = cast_or_null(ClassOrBound)) return Ctx.getConstantArrayType(T, CAT->getSize(), nullptr, ArrayType::Normal, 0); else return Ctx.getIncompleteArrayType(T, ArrayType::Normal, 0); } llvm_unreachable("unknown step kind"); } }; SmallVector Steps; // - if T1 is "pointer to cv1 C1" and T2 is "pointer to cv2 C2", where C1 // is reference-related to C2 or C2 is reference-related to C1 (8.6.3), // the cv-combined type of T1 and T2 or the cv-combined type of T2 and T1, // respectively; // - if T1 is "pointer to member of C1 of type cv1 U1" and T2 is "pointer // to member of C2 of type cv2 U2" for some non-function type U, where // C1 is reference-related to C2 or C2 is reference-related to C1, the // cv-combined type of T2 and T1 or the cv-combined type of T1 and T2, // respectively; // - if T1 and T2 are similar types (4.5), the cv-combined type of T1 and // T2; // // Dismantle T1 and T2 to simultaneously determine whether they are similar // and to prepare to form the cv-combined type if so. QualType Composite1 = T1; QualType Composite2 = T2; unsigned NeedConstBefore = 0; while (true) { assert(!Composite1.isNull() && !Composite2.isNull()); Qualifiers Q1, Q2; Composite1 = Context.getUnqualifiedArrayType(Composite1, Q1); Composite2 = Context.getUnqualifiedArrayType(Composite2, Q2); // Top-level qualifiers are ignored. Merge at all lower levels. if (!Steps.empty()) { // Find the qualifier union: (approximately) the unique minimal set of // qualifiers that is compatible with both types. Qualifiers Quals = Qualifiers::fromCVRUMask(Q1.getCVRUQualifiers() | Q2.getCVRUQualifiers()); // Under one level of pointer or pointer-to-member, we can change to an // unambiguous compatible address space. if (Q1.getAddressSpace() == Q2.getAddressSpace()) { Quals.setAddressSpace(Q1.getAddressSpace()); } else if (Steps.size() == 1) { bool MaybeQ1 = Q1.isAddressSpaceSupersetOf(Q2); bool MaybeQ2 = Q2.isAddressSpaceSupersetOf(Q1); if (MaybeQ1 == MaybeQ2) { // Exception for ptr size address spaces. Should be able to choose // either address space during comparison. if (isPtrSizeAddressSpace(Q1.getAddressSpace()) || isPtrSizeAddressSpace(Q2.getAddressSpace())) MaybeQ1 = true; else return QualType(); // No unique best address space. } Quals.setAddressSpace(MaybeQ1 ? Q1.getAddressSpace() : Q2.getAddressSpace()); } else { return QualType(); } // FIXME: In C, we merge __strong and none to __strong at the top level. if (Q1.getObjCGCAttr() == Q2.getObjCGCAttr()) Quals.setObjCGCAttr(Q1.getObjCGCAttr()); else if (T1->isVoidPointerType() || T2->isVoidPointerType()) assert(Steps.size() == 1); else return QualType(); // Mismatched lifetime qualifiers never compatibly include each other. if (Q1.getObjCLifetime() == Q2.getObjCLifetime()) Quals.setObjCLifetime(Q1.getObjCLifetime()); else if (T1->isVoidPointerType() || T2->isVoidPointerType()) assert(Steps.size() == 1); else return QualType(); Steps.back().Quals = Quals; if (Q1 != Quals || Q2 != Quals) NeedConstBefore = Steps.size() - 1; } // FIXME: Can we unify the following with UnwrapSimilarTypes? 
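      // Illustrative example (not from the original source) of the qualifier
      // merging performed here: the composite pointer type of 'int **' and
      // 'const int **' is 'const int *const *', so
      //
      //   int **p;
      //   const int **q;
      //   static_assert(__is_same(decltype(true ? p : q), const int *const *),
      //                 "");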
const ArrayType *Arr1, *Arr2; if ((Arr1 = Context.getAsArrayType(Composite1)) && (Arr2 = Context.getAsArrayType(Composite2))) { auto *CAT1 = dyn_cast(Arr1); auto *CAT2 = dyn_cast(Arr2); if (CAT1 && CAT2 && CAT1->getSize() == CAT2->getSize()) { Composite1 = Arr1->getElementType(); Composite2 = Arr2->getElementType(); Steps.emplace_back(Step::Array, CAT1); continue; } bool IAT1 = isa(Arr1); bool IAT2 = isa(Arr2); if ((IAT1 && IAT2) || (getLangOpts().CPlusPlus20 && (IAT1 != IAT2) && ((bool)CAT1 != (bool)CAT2) && (Steps.empty() || Steps.back().K != Step::Array))) { // In C++20 onwards, we can unify an array of N T with an array of // a different or unknown bound. But we can't form an array whose // element type is an array of unknown bound by doing so. Composite1 = Arr1->getElementType(); Composite2 = Arr2->getElementType(); Steps.emplace_back(Step::Array); if (CAT1 || CAT2) NeedConstBefore = Steps.size(); continue; } } const PointerType *Ptr1, *Ptr2; if ((Ptr1 = Composite1->getAs()) && (Ptr2 = Composite2->getAs())) { Composite1 = Ptr1->getPointeeType(); Composite2 = Ptr2->getPointeeType(); Steps.emplace_back(Step::Pointer); continue; } const ObjCObjectPointerType *ObjPtr1, *ObjPtr2; if ((ObjPtr1 = Composite1->getAs()) && (ObjPtr2 = Composite2->getAs())) { Composite1 = ObjPtr1->getPointeeType(); Composite2 = ObjPtr2->getPointeeType(); Steps.emplace_back(Step::ObjCPointer); continue; } const MemberPointerType *MemPtr1, *MemPtr2; if ((MemPtr1 = Composite1->getAs()) && (MemPtr2 = Composite2->getAs())) { Composite1 = MemPtr1->getPointeeType(); Composite2 = MemPtr2->getPointeeType(); // At the top level, we can perform a base-to-derived pointer-to-member // conversion: // // - [...] where C1 is reference-related to C2 or C2 is // reference-related to C1 // // (Note that the only kinds of reference-relatedness in scope here are // "same type or derived from".) At any other level, the class must // exactly match. const Type *Class = nullptr; QualType Cls1(MemPtr1->getClass(), 0); QualType Cls2(MemPtr2->getClass(), 0); if (Context.hasSameType(Cls1, Cls2)) Class = MemPtr1->getClass(); else if (Steps.empty()) Class = IsDerivedFrom(Loc, Cls1, Cls2) ? MemPtr1->getClass() : IsDerivedFrom(Loc, Cls2, Cls1) ? MemPtr2->getClass() : nullptr; if (!Class) return QualType(); Steps.emplace_back(Step::MemberPointer, Class); continue; } // Special case: at the top level, we can decompose an Objective-C pointer // and a 'cv void *'. Unify the qualifiers. if (Steps.empty() && ((Composite1->isVoidPointerType() && Composite2->isObjCObjectPointerType()) || (Composite1->isObjCObjectPointerType() && Composite2->isVoidPointerType()))) { Composite1 = Composite1->getPointeeType(); Composite2 = Composite2->getPointeeType(); Steps.emplace_back(Step::Pointer); continue; } // FIXME: block pointer types? // Cannot unwrap any more types. 
break; } // - if T1 or T2 is "pointer to noexcept function" and the other type is // "pointer to function", where the function types are otherwise the same, // "pointer to function"; // - if T1 or T2 is "pointer to member of C1 of type function", the other // type is "pointer to member of C2 of type noexcept function", and C1 // is reference-related to C2 or C2 is reference-related to C1, where // the function types are otherwise the same, "pointer to member of C2 of // type function" or "pointer to member of C1 of type function", // respectively; // // We also support 'noreturn' here, so as a Clang extension we generalize the // above to: // // - [Clang] If T1 and T2 are both of type "pointer to function" or // "pointer to member function" and the pointee types can be unified // by a function pointer conversion, that conversion is applied // before checking the following rules. // // We've already unwrapped down to the function types, and we want to merge // rather than just convert, so do this ourselves rather than calling // IsFunctionConversion. // // FIXME: In order to match the standard wording as closely as possible, we // currently only do this under a single level of pointers. Ideally, we would // allow this in general, and set NeedConstBefore to the relevant depth on // the side(s) where we changed anything. If we permit that, we should also // consider this conversion when determining type similarity and model it as // a qualification conversion. if (Steps.size() == 1) { if (auto *FPT1 = Composite1->getAs()) { if (auto *FPT2 = Composite2->getAs()) { FunctionProtoType::ExtProtoInfo EPI1 = FPT1->getExtProtoInfo(); FunctionProtoType::ExtProtoInfo EPI2 = FPT2->getExtProtoInfo(); // The result is noreturn if both operands are. bool Noreturn = EPI1.ExtInfo.getNoReturn() && EPI2.ExtInfo.getNoReturn(); EPI1.ExtInfo = EPI1.ExtInfo.withNoReturn(Noreturn); EPI2.ExtInfo = EPI2.ExtInfo.withNoReturn(Noreturn); // The result is nothrow if both operands are. SmallVector ExceptionTypeStorage; EPI1.ExceptionSpec = EPI2.ExceptionSpec = Context.mergeExceptionSpecs( EPI1.ExceptionSpec, EPI2.ExceptionSpec, ExceptionTypeStorage, getLangOpts().CPlusPlus17); Composite1 = Context.getFunctionType(FPT1->getReturnType(), FPT1->getParamTypes(), EPI1); Composite2 = Context.getFunctionType(FPT2->getReturnType(), FPT2->getParamTypes(), EPI2); } } } // There are some more conversions we can perform under exactly one pointer. if (Steps.size() == 1 && Steps.front().K == Step::Pointer && !Context.hasSameType(Composite1, Composite2)) { // - if T1 or T2 is "pointer to cv1 void" and the other type is // "pointer to cv2 T", where T is an object type or void, // "pointer to cv12 void", where cv12 is the union of cv1 and cv2; if (Composite1->isVoidType() && Composite2->isObjectType()) Composite2 = Composite1; else if (Composite2->isVoidType() && Composite1->isObjectType()) Composite1 = Composite2; // - if T1 is "pointer to cv1 C1" and T2 is "pointer to cv2 C2", where C1 // is reference-related to C2 or C2 is reference-related to C1 (8.6.3), // the cv-combined type of T1 and T2 or the cv-combined type of T2 and // T1, respectively; // // The "similar type" handling covers all of this except for the "T1 is a // base class of T2" case in the definition of reference-related. 
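  // For illustration (hypothetical types): given
  //   struct Base {}; struct Derived : Base {};
  //   Derived *D; Base *B;
  // the composite pointer type of 'D' and 'B' is 'Base *', which is the
  // reference-related (base class) case handled below.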
else if (IsDerivedFrom(Loc, Composite1, Composite2)) Composite1 = Composite2; else if (IsDerivedFrom(Loc, Composite2, Composite1)) Composite2 = Composite1; } // At this point, either the inner types are the same or we have failed to // find a composite pointer type. if (!Context.hasSameType(Composite1, Composite2)) return QualType(); // Per C++ [conv.qual]p3, add 'const' to every level before the last // differing qualifier. for (unsigned I = 0; I != NeedConstBefore; ++I) Steps[I].Quals.addConst(); // Rebuild the composite type. QualType Composite = Context.getCommonSugaredType(Composite1, Composite2); for (auto &S : llvm::reverse(Steps)) Composite = S.rebuild(Context, Composite); if (ConvertArgs) { // Convert the expressions to the composite pointer type. InitializedEntity Entity = InitializedEntity::InitializeTemporary(Composite); InitializationKind Kind = InitializationKind::CreateCopy(Loc, SourceLocation()); InitializationSequence E1ToC(*this, Entity, Kind, E1); if (!E1ToC) return QualType(); InitializationSequence E2ToC(*this, Entity, Kind, E2); if (!E2ToC) return QualType(); // FIXME: Let the caller know if these fail to avoid duplicate diagnostics. ExprResult E1Result = E1ToC.Perform(*this, Entity, Kind, E1); if (E1Result.isInvalid()) return QualType(); E1 = E1Result.get(); ExprResult E2Result = E2ToC.Perform(*this, Entity, Kind, E2); if (E2Result.isInvalid()) return QualType(); E2 = E2Result.get(); } return Composite; } ExprResult Sema::MaybeBindToTemporary(Expr *E) { if (!E) return ExprError(); assert(!isa(E) && "Double-bound temporary?"); // If the result is a glvalue, we shouldn't bind it. if (E->isGLValue()) return E; // In ARC, calls that return a retainable type can return retained, // in which case we have to insert a consuming cast. if (getLangOpts().ObjCAutoRefCount && E->getType()->isObjCRetainableType()) { bool ReturnsRetained; // For actual calls, we compute this by examining the type of the // called value. if (CallExpr *Call = dyn_cast(E)) { Expr *Callee = Call->getCallee()->IgnoreParens(); QualType T = Callee->getType(); if (T == Context.BoundMemberTy) { // Handle pointer-to-members. if (BinaryOperator *BinOp = dyn_cast(Callee)) T = BinOp->getRHS()->getType(); else if (MemberExpr *Mem = dyn_cast(Callee)) T = Mem->getMemberDecl()->getType(); } if (const PointerType *Ptr = T->getAs()) T = Ptr->getPointeeType(); else if (const BlockPointerType *Ptr = T->getAs()) T = Ptr->getPointeeType(); else if (const MemberPointerType *MemPtr = T->getAs()) T = MemPtr->getPointeeType(); auto *FTy = T->castAs(); ReturnsRetained = FTy->getExtInfo().getProducesResult(); // ActOnStmtExpr arranges things so that StmtExprs of retainable // type always produce a +1 object. } else if (isa(E)) { ReturnsRetained = true; // We hit this case with the lambda conversion-to-block optimization; // we don't want any extra casts here. } else if (isa(E) && isa(cast(E)->getSubExpr())) { return E; // For message sends and property references, we try to find an // actual method. FIXME: we should infer retention by selector in // cases where we don't have an actual method. } else { ObjCMethodDecl *D = nullptr; if (ObjCMessageExpr *Send = dyn_cast(E)) { D = Send->getMethodDecl(); } else if (ObjCBoxedExpr *BoxedExpr = dyn_cast(E)) { D = BoxedExpr->getBoxingMethod(); } else if (ObjCArrayLiteral *ArrayLit = dyn_cast(E)) { // Don't do reclaims if we're using the zero-element array // constant. 
if (ArrayLit->getNumElements() == 0 && Context.getLangOpts().ObjCRuntime.hasEmptyCollections()) return E; D = ArrayLit->getArrayWithObjectsMethod(); } else if (ObjCDictionaryLiteral *DictLit = dyn_cast(E)) { // Don't do reclaims if we're using the zero-element dictionary // constant. if (DictLit->getNumElements() == 0 && Context.getLangOpts().ObjCRuntime.hasEmptyCollections()) return E; D = DictLit->getDictWithObjectsMethod(); } ReturnsRetained = (D && D->hasAttr()); // Don't do reclaims on performSelector calls; despite their // return type, the invoked method doesn't necessarily actually // return an object. if (!ReturnsRetained && D && D->getMethodFamily() == OMF_performSelector) return E; } // Don't reclaim an object of Class type. if (!ReturnsRetained && E->getType()->isObjCARCImplicitlyUnretainedType()) return E; Cleanup.setExprNeedsCleanups(true); CastKind ck = (ReturnsRetained ? CK_ARCConsumeObject : CK_ARCReclaimReturnedObject); return ImplicitCastExpr::Create(Context, E->getType(), ck, E, nullptr, VK_PRValue, FPOptionsOverride()); } if (E->getType().isDestructedType() == QualType::DK_nontrivial_c_struct) Cleanup.setExprNeedsCleanups(true); if (!getLangOpts().CPlusPlus) return E; // Search for the base element type (cf. ASTContext::getBaseElementType) with // a fast path for the common case that the type is directly a RecordType. const Type *T = Context.getCanonicalType(E->getType().getTypePtr()); const RecordType *RT = nullptr; while (!RT) { switch (T->getTypeClass()) { case Type::Record: RT = cast(T); break; case Type::ConstantArray: case Type::IncompleteArray: case Type::VariableArray: case Type::DependentSizedArray: T = cast(T)->getElementType().getTypePtr(); break; default: return E; } } // That should be enough to guarantee that this type is complete, if we're // not processing a decltype expression. CXXRecordDecl *RD = cast(RT->getDecl()); if (RD->isInvalidDecl() || RD->isDependentContext()) return E; bool IsDecltype = ExprEvalContexts.back().ExprContext == ExpressionEvaluationContextRecord::EK_Decltype; CXXDestructorDecl *Destructor = IsDecltype ? nullptr : LookupDestructor(RD); if (Destructor) { MarkFunctionReferenced(E->getExprLoc(), Destructor); CheckDestructorAccess(E->getExprLoc(), Destructor, PDiag(diag::err_access_dtor_temp) << E->getType()); if (DiagnoseUseOfDecl(Destructor, E->getExprLoc())) return ExprError(); // If destructor is trivial, we can avoid the extra copy. if (Destructor->isTrivial()) return E; // We need a cleanup, but we don't need to remember the temporary. 
Cleanup.setExprNeedsCleanups(true); } CXXTemporary *Temp = CXXTemporary::Create(Context, Destructor); CXXBindTemporaryExpr *Bind = CXXBindTemporaryExpr::Create(Context, Temp, E); if (IsDecltype) ExprEvalContexts.back().DelayedDecltypeBinds.push_back(Bind); return Bind; } ExprResult Sema::MaybeCreateExprWithCleanups(ExprResult SubExpr) { if (SubExpr.isInvalid()) return ExprError(); return MaybeCreateExprWithCleanups(SubExpr.get()); } Expr *Sema::MaybeCreateExprWithCleanups(Expr *SubExpr) { assert(SubExpr && "subexpression can't be null!"); CleanupVarDeclMarking(); unsigned FirstCleanup = ExprEvalContexts.back().NumCleanupObjects; assert(ExprCleanupObjects.size() >= FirstCleanup); assert(Cleanup.exprNeedsCleanups() || ExprCleanupObjects.size() == FirstCleanup); if (!Cleanup.exprNeedsCleanups()) return SubExpr; auto Cleanups = llvm::ArrayRef(ExprCleanupObjects.begin() + FirstCleanup, ExprCleanupObjects.size() - FirstCleanup); auto *E = ExprWithCleanups::Create( Context, SubExpr, Cleanup.cleanupsHaveSideEffects(), Cleanups); DiscardCleanupsInEvaluationContext(); return E; } Stmt *Sema::MaybeCreateStmtWithCleanups(Stmt *SubStmt) { assert(SubStmt && "sub-statement can't be null!"); CleanupVarDeclMarking(); if (!Cleanup.exprNeedsCleanups()) return SubStmt; // FIXME: In order to attach the temporaries, wrap the statement into // a StmtExpr; currently this is only used for asm statements. // This is hacky, either create a new CXXStmtWithTemporaries statement or // a new AsmStmtWithTemporaries. CompoundStmt *CompStmt = CompoundStmt::Create(Context, SubStmt, FPOptionsOverride(), SourceLocation(), SourceLocation()); Expr *E = new (Context) StmtExpr(CompStmt, Context.VoidTy, SourceLocation(), SourceLocation(), /*FIXME TemplateDepth=*/0); return MaybeCreateExprWithCleanups(E); } /// Process the expression contained within a decltype. For such expressions, /// certain semantic checks on temporaries are delayed until this point, and /// are omitted for the 'topmost' call in the decltype expression. If the /// topmost call bound a temporary, strip that temporary off the expression. ExprResult Sema::ActOnDecltypeExpression(Expr *E) { assert(ExprEvalContexts.back().ExprContext == ExpressionEvaluationContextRecord::EK_Decltype && "not in a decltype expression"); ExprResult Result = CheckPlaceholderExpr(E); if (Result.isInvalid()) return ExprError(); E = Result.get(); // C++11 [expr.call]p11: // If a function call is a prvalue of object type, // -- if the function call is either // -- the operand of a decltype-specifier, or // -- the right operand of a comma operator that is the operand of a // decltype-specifier, // a temporary object is not introduced for the prvalue. // Recursively rebuild ParenExprs and comma expressions to strip out the // outermost CXXBindTemporaryExpr, if any. if (ParenExpr *PE = dyn_cast(E)) { ExprResult SubExpr = ActOnDecltypeExpression(PE->getSubExpr()); if (SubExpr.isInvalid()) return ExprError(); if (SubExpr.get() == PE->getSubExpr()) return E; return ActOnParenExpr(PE->getLParen(), PE->getRParen(), SubExpr.get()); } if (BinaryOperator *BO = dyn_cast(E)) { if (BO->getOpcode() == BO_Comma) { ExprResult RHS = ActOnDecltypeExpression(BO->getRHS()); if (RHS.isInvalid()) return ExprError(); if (RHS.get() == BO->getRHS()) return E; return BinaryOperator::Create(Context, BO->getLHS(), RHS.get(), BO_Comma, BO->getType(), BO->getValueKind(), BO->getObjectKind(), BO->getOperatorLoc(), BO->getFPFeatures()); } } CXXBindTemporaryExpr *TopBind = dyn_cast(E); CallExpr *TopCall = TopBind ? 
dyn_cast(TopBind->getSubExpr()) : nullptr; if (TopCall) E = TopCall; else TopBind = nullptr; // Disable the special decltype handling now. ExprEvalContexts.back().ExprContext = ExpressionEvaluationContextRecord::EK_Other; Result = CheckUnevaluatedOperand(E); if (Result.isInvalid()) return ExprError(); E = Result.get(); // In MS mode, don't perform any extra checking of call return types within a // decltype expression. if (getLangOpts().MSVCCompat) return E; // Perform the semantic checks we delayed until this point. for (unsigned I = 0, N = ExprEvalContexts.back().DelayedDecltypeCalls.size(); I != N; ++I) { CallExpr *Call = ExprEvalContexts.back().DelayedDecltypeCalls[I]; if (Call == TopCall) continue; if (CheckCallReturnType(Call->getCallReturnType(Context), Call->getBeginLoc(), Call, Call->getDirectCallee())) return ExprError(); } // Now all relevant types are complete, check the destructors are accessible // and non-deleted, and annotate them on the temporaries. for (unsigned I = 0, N = ExprEvalContexts.back().DelayedDecltypeBinds.size(); I != N; ++I) { CXXBindTemporaryExpr *Bind = ExprEvalContexts.back().DelayedDecltypeBinds[I]; if (Bind == TopBind) continue; CXXTemporary *Temp = Bind->getTemporary(); CXXRecordDecl *RD = Bind->getType()->getBaseElementTypeUnsafe()->getAsCXXRecordDecl(); CXXDestructorDecl *Destructor = LookupDestructor(RD); Temp->setDestructor(Destructor); MarkFunctionReferenced(Bind->getExprLoc(), Destructor); CheckDestructorAccess(Bind->getExprLoc(), Destructor, PDiag(diag::err_access_dtor_temp) << Bind->getType()); if (DiagnoseUseOfDecl(Destructor, Bind->getExprLoc())) return ExprError(); // We need a cleanup, but we don't need to remember the temporary. Cleanup.setExprNeedsCleanups(true); } // Possibly strip off the top CXXBindTemporaryExpr. return E; } /// Note a set of 'operator->' functions that were used for a member access. static void noteOperatorArrows(Sema &S, ArrayRef OperatorArrows) { unsigned SkipStart = OperatorArrows.size(), SkipCount = 0; // FIXME: Make this configurable? unsigned Limit = 9; if (OperatorArrows.size() > Limit) { // Produce Limit-1 normal notes and one 'skipping' note. SkipStart = (Limit - 1) / 2 + (Limit - 1) % 2; SkipCount = OperatorArrows.size() - (Limit - 1); } for (unsigned I = 0; I < OperatorArrows.size(); /**/) { if (I == SkipStart) { S.Diag(OperatorArrows[I]->getLocation(), diag::note_operator_arrows_suppressed) << SkipCount; I += SkipCount; } else { S.Diag(OperatorArrows[I]->getLocation(), diag::note_operator_arrow_here) << OperatorArrows[I]->getCallResultType(); ++I; } } } ExprResult Sema::ActOnStartCXXMemberReference(Scope *S, Expr *Base, SourceLocation OpLoc, tok::TokenKind OpKind, ParsedType &ObjectType, bool &MayBePseudoDestructor) { // Since this might be a postfix expression, get rid of ParenListExprs. ExprResult Result = MaybeConvertParenListExprToParenExpr(S, Base); if (Result.isInvalid()) return ExprError(); Base = Result.get(); Result = CheckPlaceholderExpr(Base); if (Result.isInvalid()) return ExprError(); Base = Result.get(); QualType BaseType = Base->getType(); MayBePseudoDestructor = false; if (BaseType->isDependentType()) { // If we have a pointer to a dependent type and are using the -> operator, // the object type is the type that the pointer points to. We might still // have enough information about that type to do something useful. 
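    // For illustration (hypothetical template):
    //   template <typename T> void f(T *P) { P->foo(); }
    // 'T *' is dependent, but the pointee 'T' still gives later lookup
    // something to work with at instantiation time.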
if (OpKind == tok::arrow) if (const PointerType *Ptr = BaseType->getAs()) BaseType = Ptr->getPointeeType(); ObjectType = ParsedType::make(BaseType); MayBePseudoDestructor = true; return Base; } // C++ [over.match.oper]p8: // [...] When operator->returns, the operator-> is applied to the value // returned, with the original second operand. if (OpKind == tok::arrow) { QualType StartingType = BaseType; bool NoArrowOperatorFound = false; bool FirstIteration = true; FunctionDecl *CurFD = dyn_cast(CurContext); // The set of types we've considered so far. llvm::SmallPtrSet CTypes; SmallVector OperatorArrows; CTypes.insert(Context.getCanonicalType(BaseType)); while (BaseType->isRecordType()) { if (OperatorArrows.size() >= getLangOpts().ArrowDepth) { Diag(OpLoc, diag::err_operator_arrow_depth_exceeded) << StartingType << getLangOpts().ArrowDepth << Base->getSourceRange(); noteOperatorArrows(*this, OperatorArrows); Diag(OpLoc, diag::note_operator_arrow_depth) << getLangOpts().ArrowDepth; return ExprError(); } Result = BuildOverloadedArrowExpr( S, Base, OpLoc, // When in a template specialization and on the first loop iteration, // potentially give the default diagnostic (with the fixit in a // separate note) instead of having the error reported back to here // and giving a diagnostic with a fixit attached to the error itself. (FirstIteration && CurFD && CurFD->isFunctionTemplateSpecialization()) ? nullptr : &NoArrowOperatorFound); if (Result.isInvalid()) { if (NoArrowOperatorFound) { if (FirstIteration) { Diag(OpLoc, diag::err_typecheck_member_reference_suggestion) << BaseType << 1 << Base->getSourceRange() << FixItHint::CreateReplacement(OpLoc, "."); OpKind = tok::period; break; } Diag(OpLoc, diag::err_typecheck_member_reference_arrow) << BaseType << Base->getSourceRange(); CallExpr *CE = dyn_cast(Base); if (Decl *CD = (CE ? CE->getCalleeDecl() : nullptr)) { Diag(CD->getBeginLoc(), diag::note_member_reference_arrow_from_operator_arrow); } } return ExprError(); } Base = Result.get(); if (CXXOperatorCallExpr *OpCall = dyn_cast(Base)) OperatorArrows.push_back(OpCall->getDirectCallee()); BaseType = Base->getType(); CanQualType CBaseType = Context.getCanonicalType(BaseType); if (!CTypes.insert(CBaseType).second) { Diag(OpLoc, diag::err_operator_arrow_circular) << StartingType; noteOperatorArrows(*this, OperatorArrows); return ExprError(); } FirstIteration = false; } if (OpKind == tok::arrow) { if (BaseType->isPointerType()) BaseType = BaseType->getPointeeType(); else if (auto *AT = Context.getAsArrayType(BaseType)) BaseType = AT->getElementType(); } } // Objective-C properties allow "." access on Objective-C pointer types, // so adjust the base type to the object type itself. if (BaseType->isObjCObjectPointerType()) BaseType = BaseType->getPointeeType(); // C++ [basic.lookup.classref]p2: // [...] If the type of the object expression is of pointer to scalar // type, the unqualified-id is looked up in the context of the complete // postfix-expression. // // This also indicates that we could be parsing a pseudo-destructor-name. // Note that Objective-C class and object types can be pseudo-destructor // expressions or normal member (ivar or property) access expressions, and // it's legal for the type to be incomplete if this is a pseudo-destructor // call. We'll do more incomplete-type checks later in the lookup process, // so just skip this check for ObjC types. 
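  // For illustration (hypothetical alias): a pseudo-destructor call on a
  // scalar is not a class member access at all:
  //   using I = int;
  //   void f(I X) { X.~I(); }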
if (!BaseType->isRecordType()) { ObjectType = ParsedType::make(BaseType); MayBePseudoDestructor = true; return Base; } // The object type must be complete (or dependent), or // C++11 [expr.prim.general]p3: // Unlike the object expression in other contexts, *this is not required to // be of complete type for purposes of class member access (5.2.5) outside // the member function body. if (!BaseType->isDependentType() && !isThisOutsideMemberFunctionBody(BaseType) && RequireCompleteType(OpLoc, BaseType, diag::err_incomplete_member_access)) { return CreateRecoveryExpr(Base->getBeginLoc(), Base->getEndLoc(), {Base}); } // C++ [basic.lookup.classref]p2: // If the id-expression in a class member access (5.2.5) is an // unqualified-id, and the type of the object expression is of a class // type C (or of pointer to a class type C), the unqualified-id is looked // up in the scope of class C. [...] ObjectType = ParsedType::make(BaseType); return Base; } static bool CheckArrow(Sema &S, QualType &ObjectType, Expr *&Base, tok::TokenKind &OpKind, SourceLocation OpLoc) { if (Base->hasPlaceholderType()) { ExprResult result = S.CheckPlaceholderExpr(Base); if (result.isInvalid()) return true; Base = result.get(); } ObjectType = Base->getType(); // C++ [expr.pseudo]p2: // The left-hand side of the dot operator shall be of scalar type. The // left-hand side of the arrow operator shall be of pointer to scalar type. // This scalar type is the object type. // Note that this is rather different from the normal handling for the // arrow operator. if (OpKind == tok::arrow) { // The operator requires a prvalue, so perform lvalue conversions. // Only do this if we might plausibly end with a pointer, as otherwise // this was likely to be intended to be a '.'. if (ObjectType->isPointerType() || ObjectType->isArrayType() || ObjectType->isFunctionType()) { ExprResult BaseResult = S.DefaultFunctionArrayLvalueConversion(Base); if (BaseResult.isInvalid()) return true; Base = BaseResult.get(); ObjectType = Base->getType(); } if (const PointerType *Ptr = ObjectType->getAs()) { ObjectType = Ptr->getPointeeType(); } else if (!Base->isTypeDependent()) { // The user wrote "p->" when they probably meant "p."; fix it. S.Diag(OpLoc, diag::err_typecheck_member_reference_suggestion) << ObjectType << true << FixItHint::CreateReplacement(OpLoc, "."); if (S.isSFINAEContext()) return true; OpKind = tok::period; } } return false; } /// Check if it's ok to try and recover dot pseudo destructor calls on /// pointer objects. static bool canRecoverDotPseudoDestructorCallsOnPointerObjects(Sema &SemaRef, QualType DestructedType) { // If this is a record type, check if its destructor is callable. if (auto *RD = DestructedType->getAsCXXRecordDecl()) { if (RD->hasDefinition()) if (CXXDestructorDecl *D = SemaRef.LookupDestructor(RD)) return SemaRef.CanUseDecl(D, /*TreatUnavailableAsInvalid=*/false); return false; } // Otherwise, check if it's a type for which it's valid to use a pseudo-dtor. 
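  // (For illustration: 'int', 'int *', or an extended vector type would be
  // accepted here; a pseudo-destructor call on such a type has no effect, so
  // suggesting the '->' recovery for it is harmless.)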
return DestructedType->isDependentType() || DestructedType->isScalarType() || DestructedType->isVectorType(); } ExprResult Sema::BuildPseudoDestructorExpr(Expr *Base, SourceLocation OpLoc, tok::TokenKind OpKind, const CXXScopeSpec &SS, TypeSourceInfo *ScopeTypeInfo, SourceLocation CCLoc, SourceLocation TildeLoc, PseudoDestructorTypeStorage Destructed) { TypeSourceInfo *DestructedTypeInfo = Destructed.getTypeSourceInfo(); QualType ObjectType; if (CheckArrow(*this, ObjectType, Base, OpKind, OpLoc)) return ExprError(); if (!ObjectType->isDependentType() && !ObjectType->isScalarType() && !ObjectType->isVectorType()) { if (getLangOpts().MSVCCompat && ObjectType->isVoidType()) Diag(OpLoc, diag::ext_pseudo_dtor_on_void) << Base->getSourceRange(); else { Diag(OpLoc, diag::err_pseudo_dtor_base_not_scalar) << ObjectType << Base->getSourceRange(); return ExprError(); } } // C++ [expr.pseudo]p2: // [...] The cv-unqualified versions of the object type and of the type // designated by the pseudo-destructor-name shall be the same type. if (DestructedTypeInfo) { QualType DestructedType = DestructedTypeInfo->getType(); SourceLocation DestructedTypeStart = DestructedTypeInfo->getTypeLoc().getBeginLoc(); if (!DestructedType->isDependentType() && !ObjectType->isDependentType()) { if (!Context.hasSameUnqualifiedType(DestructedType, ObjectType)) { // Detect dot pseudo destructor calls on pointer objects, e.g.: // Foo *foo; // foo.~Foo(); if (OpKind == tok::period && ObjectType->isPointerType() && Context.hasSameUnqualifiedType(DestructedType, ObjectType->getPointeeType())) { auto Diagnostic = Diag(OpLoc, diag::err_typecheck_member_reference_suggestion) << ObjectType << /*IsArrow=*/0 << Base->getSourceRange(); // Issue a fixit only when the destructor is valid. if (canRecoverDotPseudoDestructorCallsOnPointerObjects( *this, DestructedType)) Diagnostic << FixItHint::CreateReplacement(OpLoc, "->"); // Recover by setting the object type to the destructed type and the // operator to '->'. ObjectType = DestructedType; OpKind = tok::arrow; } else { Diag(DestructedTypeStart, diag::err_pseudo_dtor_type_mismatch) << ObjectType << DestructedType << Base->getSourceRange() << DestructedTypeInfo->getTypeLoc().getSourceRange(); // Recover by setting the destructed type to the object type. DestructedType = ObjectType; DestructedTypeInfo = Context.getTrivialTypeSourceInfo(ObjectType, DestructedTypeStart); Destructed = PseudoDestructorTypeStorage(DestructedTypeInfo); } } else if (DestructedType.getObjCLifetime() != ObjectType.getObjCLifetime()) { if (DestructedType.getObjCLifetime() == Qualifiers::OCL_None) { // Okay: just pretend that the user provided the correctly-qualified // type. } else { Diag(DestructedTypeStart, diag::err_arc_pseudo_dtor_inconstant_quals) << ObjectType << DestructedType << Base->getSourceRange() << DestructedTypeInfo->getTypeLoc().getSourceRange(); } // Recover by setting the destructed type to the object type. DestructedType = ObjectType; DestructedTypeInfo = Context.getTrivialTypeSourceInfo(ObjectType, DestructedTypeStart); Destructed = PseudoDestructorTypeStorage(DestructedTypeInfo); } } } // C++ [expr.pseudo]p2: // [...] Furthermore, the two type-names in a pseudo-destructor-name of the // form // // ::[opt] nested-name-specifier[opt] type-name :: ~ type-name // // shall designate the same scalar type. 
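  // For illustration (hypothetical typedefs):
  //   typedef int I; typedef float J;
  //   void f(int N) { N.I::~I(); }   // OK: both type-names designate 'int'
  // whereas 'N.J::~I()' would trip the scope-type mismatch check below.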
if (ScopeTypeInfo) { QualType ScopeType = ScopeTypeInfo->getType(); if (!ScopeType->isDependentType() && !ObjectType->isDependentType() && !Context.hasSameUnqualifiedType(ScopeType, ObjectType)) { Diag(ScopeTypeInfo->getTypeLoc().getSourceRange().getBegin(), diag::err_pseudo_dtor_type_mismatch) << ObjectType << ScopeType << Base->getSourceRange() << ScopeTypeInfo->getTypeLoc().getSourceRange(); ScopeType = QualType(); ScopeTypeInfo = nullptr; } } Expr *Result = new (Context) CXXPseudoDestructorExpr(Context, Base, OpKind == tok::arrow, OpLoc, SS.getWithLocInContext(Context), ScopeTypeInfo, CCLoc, TildeLoc, Destructed); return Result; } ExprResult Sema::ActOnPseudoDestructorExpr(Scope *S, Expr *Base, SourceLocation OpLoc, tok::TokenKind OpKind, CXXScopeSpec &SS, UnqualifiedId &FirstTypeName, SourceLocation CCLoc, SourceLocation TildeLoc, UnqualifiedId &SecondTypeName) { assert((FirstTypeName.getKind() == UnqualifiedIdKind::IK_TemplateId || FirstTypeName.getKind() == UnqualifiedIdKind::IK_Identifier) && "Invalid first type name in pseudo-destructor"); assert((SecondTypeName.getKind() == UnqualifiedIdKind::IK_TemplateId || SecondTypeName.getKind() == UnqualifiedIdKind::IK_Identifier) && "Invalid second type name in pseudo-destructor"); QualType ObjectType; if (CheckArrow(*this, ObjectType, Base, OpKind, OpLoc)) return ExprError(); // Compute the object type that we should use for name lookup purposes. Only // record types and dependent types matter. ParsedType ObjectTypePtrForLookup; if (!SS.isSet()) { if (ObjectType->isRecordType()) ObjectTypePtrForLookup = ParsedType::make(ObjectType); else if (ObjectType->isDependentType()) ObjectTypePtrForLookup = ParsedType::make(Context.DependentTy); } // Convert the name of the type being destructed (following the ~) into a // type (with source-location information). QualType DestructedType; TypeSourceInfo *DestructedTypeInfo = nullptr; PseudoDestructorTypeStorage Destructed; if (SecondTypeName.getKind() == UnqualifiedIdKind::IK_Identifier) { ParsedType T = getTypeName(*SecondTypeName.Identifier, SecondTypeName.StartLocation, S, &SS, true, false, ObjectTypePtrForLookup, /*IsCtorOrDtorName*/true); if (!T && ((SS.isSet() && !computeDeclContext(SS, false)) || (!SS.isSet() && ObjectType->isDependentType()))) { // The name of the type being destroyed is a dependent name, and we // couldn't find anything useful in scope. Just store the identifier and // it's location, and we'll perform (qualified) name lookup again at // template instantiation time. Destructed = PseudoDestructorTypeStorage(SecondTypeName.Identifier, SecondTypeName.StartLocation); } else if (!T) { Diag(SecondTypeName.StartLocation, diag::err_pseudo_dtor_destructor_non_type) << SecondTypeName.Identifier << ObjectType; if (isSFINAEContext()) return ExprError(); // Recover by assuming we had the right type all along. DestructedType = ObjectType; } else DestructedType = GetTypeFromParser(T, &DestructedTypeInfo); } else { // Resolve the template-id to a type. TemplateIdAnnotation *TemplateId = SecondTypeName.TemplateId; ASTTemplateArgsPtr TemplateArgsPtr(TemplateId->getTemplateArgs(), TemplateId->NumArgs); TypeResult T = ActOnTemplateIdType(S, SS, TemplateId->TemplateKWLoc, TemplateId->Template, TemplateId->Name, TemplateId->TemplateNameLoc, TemplateId->LAngleLoc, TemplateArgsPtr, TemplateId->RAngleLoc, /*IsCtorOrDtorName*/true); if (T.isInvalid() || !T.get()) { // Recover by assuming we had the right type all along. 
DestructedType = ObjectType; } else DestructedType = GetTypeFromParser(T.get(), &DestructedTypeInfo); } // If we've performed some kind of recovery, (re-)build the type source // information. if (!DestructedType.isNull()) { if (!DestructedTypeInfo) DestructedTypeInfo = Context.getTrivialTypeSourceInfo(DestructedType, SecondTypeName.StartLocation); Destructed = PseudoDestructorTypeStorage(DestructedTypeInfo); } // Convert the name of the scope type (the type prior to '::') into a type. TypeSourceInfo *ScopeTypeInfo = nullptr; QualType ScopeType; if (FirstTypeName.getKind() == UnqualifiedIdKind::IK_TemplateId || FirstTypeName.Identifier) { if (FirstTypeName.getKind() == UnqualifiedIdKind::IK_Identifier) { ParsedType T = getTypeName(*FirstTypeName.Identifier, FirstTypeName.StartLocation, S, &SS, true, false, ObjectTypePtrForLookup, /*IsCtorOrDtorName*/true); if (!T) { Diag(FirstTypeName.StartLocation, diag::err_pseudo_dtor_destructor_non_type) << FirstTypeName.Identifier << ObjectType; if (isSFINAEContext()) return ExprError(); // Just drop this type. It's unnecessary anyway. ScopeType = QualType(); } else ScopeType = GetTypeFromParser(T, &ScopeTypeInfo); } else { // Resolve the template-id to a type. TemplateIdAnnotation *TemplateId = FirstTypeName.TemplateId; ASTTemplateArgsPtr TemplateArgsPtr(TemplateId->getTemplateArgs(), TemplateId->NumArgs); TypeResult T = ActOnTemplateIdType(S, SS, TemplateId->TemplateKWLoc, TemplateId->Template, TemplateId->Name, TemplateId->TemplateNameLoc, TemplateId->LAngleLoc, TemplateArgsPtr, TemplateId->RAngleLoc, /*IsCtorOrDtorName*/true); if (T.isInvalid() || !T.get()) { // Recover by dropping this type. ScopeType = QualType(); } else ScopeType = GetTypeFromParser(T.get(), &ScopeTypeInfo); } } if (!ScopeType.isNull() && !ScopeTypeInfo) ScopeTypeInfo = Context.getTrivialTypeSourceInfo(ScopeType, FirstTypeName.StartLocation); return BuildPseudoDestructorExpr(Base, OpLoc, OpKind, SS, ScopeTypeInfo, CCLoc, TildeLoc, Destructed); } ExprResult Sema::ActOnPseudoDestructorExpr(Scope *S, Expr *Base, SourceLocation OpLoc, tok::TokenKind OpKind, SourceLocation TildeLoc, const DeclSpec& DS) { QualType ObjectType; if (CheckArrow(*this, ObjectType, Base, OpKind, OpLoc)) return ExprError(); if (DS.getTypeSpecType() == DeclSpec::TST_decltype_auto) { Diag(DS.getTypeSpecTypeLoc(), diag::err_decltype_auto_invalid); return true; } QualType T = BuildDecltypeType(DS.getRepAsExpr(), /*AsUnevaluated=*/false); TypeLocBuilder TLB; DecltypeTypeLoc DecltypeTL = TLB.push(T); DecltypeTL.setDecltypeLoc(DS.getTypeSpecTypeLoc()); DecltypeTL.setRParenLoc(DS.getTypeofParensRange().getEnd()); TypeSourceInfo *DestructedTypeInfo = TLB.getTypeSourceInfo(Context, T); PseudoDestructorTypeStorage Destructed(DestructedTypeInfo); return BuildPseudoDestructorExpr(Base, OpLoc, OpKind, CXXScopeSpec(), nullptr, SourceLocation(), TildeLoc, Destructed); } ExprResult Sema::BuildCXXMemberCallExpr(Expr *E, NamedDecl *FoundDecl, CXXConversionDecl *Method, bool HadMultipleCandidates) { // Convert the expression to match the conversion function's implicit object // parameter. ExprResult Exp = PerformObjectArgumentInitialization(E, /*Qualifier=*/nullptr, FoundDecl, Method); if (Exp.isInvalid()) return true; if (Method->getParent()->isLambda() && Method->getConversionType()->isBlockPointerType()) { // This is a lambda conversion to block pointer; check if the argument // was a LambdaExpr. 
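    // For illustration (requires blocks to be enabled):
    //   void (^Blk)() = []{};
    // The initializer is a lambda converted to a block pointer; that is the
    // case special-cased here.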
Expr *SubE = E; CastExpr *CE = dyn_cast(SubE); if (CE && CE->getCastKind() == CK_NoOp) SubE = CE->getSubExpr(); SubE = SubE->IgnoreParens(); if (CXXBindTemporaryExpr *BE = dyn_cast(SubE)) SubE = BE->getSubExpr(); if (isa(SubE)) { // For the conversion to block pointer on a lambda expression, we // construct a special BlockLiteral instead; this doesn't really make // a difference in ARC, but outside of ARC the resulting block literal // follows the normal lifetime rules for block literals instead of being // autoreleased. PushExpressionEvaluationContext( ExpressionEvaluationContext::PotentiallyEvaluated); ExprResult BlockExp = BuildBlockForLambdaConversion( Exp.get()->getExprLoc(), Exp.get()->getExprLoc(), Method, Exp.get()); PopExpressionEvaluationContext(); // FIXME: This note should be produced by a CodeSynthesisContext. if (BlockExp.isInvalid()) Diag(Exp.get()->getExprLoc(), diag::note_lambda_to_block_conv); return BlockExp; } } MemberExpr *ME = BuildMemberExpr(Exp.get(), /*IsArrow=*/false, SourceLocation(), NestedNameSpecifierLoc(), SourceLocation(), Method, DeclAccessPair::make(FoundDecl, FoundDecl->getAccess()), HadMultipleCandidates, DeclarationNameInfo(), Context.BoundMemberTy, VK_PRValue, OK_Ordinary); QualType ResultType = Method->getReturnType(); ExprValueKind VK = Expr::getValueKindForType(ResultType); ResultType = ResultType.getNonLValueExprType(Context); CXXMemberCallExpr *CE = CXXMemberCallExpr::Create( Context, ME, /*Args=*/{}, ResultType, VK, Exp.get()->getEndLoc(), CurFPFeatureOverrides()); if (CheckFunctionCall(Method, CE, Method->getType()->castAs())) return ExprError(); return CheckForImmediateInvocation(CE, CE->getMethodDecl()); } ExprResult Sema::BuildCXXNoexceptExpr(SourceLocation KeyLoc, Expr *Operand, SourceLocation RParen) { // If the operand is an unresolved lookup expression, the expression is ill- // formed per [over.over]p1, because overloaded function names cannot be used // without arguments except in explicit contexts. ExprResult R = CheckPlaceholderExpr(Operand); if (R.isInvalid()) return R; R = CheckUnevaluatedOperand(R.get()); if (R.isInvalid()) return ExprError(); Operand = R.get(); if (!inTemplateInstantiation() && !Operand->isInstantiationDependent() && Operand->HasSideEffects(Context, false)) { // The expression operand for noexcept is in an unevaluated expression // context, so side effects could result in unintended consequences. 
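    // For illustration (hypothetical snippet):
    //   int I = 0;
    //   bool B = noexcept(I++);   // 'I' is never incremented; warn below.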
Diag(Operand->getExprLoc(), diag::warn_side_effects_unevaluated_context); } CanThrowResult CanThrow = canThrow(Operand); return new (Context) CXXNoexceptExpr(Context.BoolTy, Operand, CanThrow, KeyLoc, RParen); } ExprResult Sema::ActOnNoexceptExpr(SourceLocation KeyLoc, SourceLocation, Expr *Operand, SourceLocation RParen) { return BuildCXXNoexceptExpr(KeyLoc, Operand, RParen); } static void MaybeDecrementCount( Expr *E, llvm::DenseMap &RefsMinusAssignments) { DeclRefExpr *LHS = nullptr; bool IsCompoundAssign = false; bool isIncrementDecrementUnaryOp = false; if (BinaryOperator *BO = dyn_cast(E)) { if (BO->getLHS()->getType()->isDependentType() || BO->getRHS()->getType()->isDependentType()) { if (BO->getOpcode() != BO_Assign) return; } else if (!BO->isAssignmentOp()) return; else IsCompoundAssign = BO->isCompoundAssignmentOp(); LHS = dyn_cast(BO->getLHS()); } else if (CXXOperatorCallExpr *COCE = dyn_cast(E)) { if (COCE->getOperator() != OO_Equal) return; LHS = dyn_cast(COCE->getArg(0)); } else if (UnaryOperator *UO = dyn_cast(E)) { if (!UO->isIncrementDecrementOp()) return; isIncrementDecrementUnaryOp = true; LHS = dyn_cast(UO->getSubExpr()); } if (!LHS) return; VarDecl *VD = dyn_cast(LHS->getDecl()); if (!VD) return; // Don't decrement RefsMinusAssignments if volatile variable with compound // assignment (+=, ...) or increment/decrement unary operator to avoid // potential unused-but-set-variable warning. if ((IsCompoundAssign || isIncrementDecrementUnaryOp) && VD->getType().isVolatileQualified()) return; auto iter = RefsMinusAssignments.find(VD); if (iter == RefsMinusAssignments.end()) return; iter->getSecond()--; } /// Perform the conversions required for an expression used in a /// context that ignores the result. ExprResult Sema::IgnoredValueConversions(Expr *E) { MaybeDecrementCount(E, RefsMinusAssignments); if (E->hasPlaceholderType()) { ExprResult result = CheckPlaceholderExpr(E); if (result.isInvalid()) return E; E = result.get(); } // C99 6.3.2.1: // [Except in specific positions,] an lvalue that does not have // array type is converted to the value stored in the // designated object (and is no longer an lvalue). if (E->isPRValue()) { // In C, function designators (i.e. expressions of function type) // are r-values, but we still want to do function-to-pointer decay // on them. This is both technically correct and convenient for // some clients. if (!getLangOpts().CPlusPlus && E->getType()->isFunctionType()) return DefaultFunctionArrayConversion(E); return E; } if (getLangOpts().CPlusPlus) { // The C++11 standard defines the notion of a discarded-value expression; // normally, we don't need to do anything to handle it, but if it is a // volatile lvalue with a special form, we perform an lvalue-to-rvalue // conversion. if (getLangOpts().CPlusPlus11 && E->isReadIfDiscardedInCPlusPlus11()) { ExprResult Res = DefaultLvalueConversion(E); if (Res.isInvalid()) return E; E = Res.get(); } else { // Per C++2a [expr.ass]p5, a volatile assignment is not deprecated if // it occurs as a discarded-value expression. CheckUnusedVolatileAssignment(E); } // C++1z: // If the expression is a prvalue after this optional conversion, the // temporary materialization conversion is applied. // // We skip this step: IR generation is able to synthesize the storage for // itself in the aggregate case, and adding the extra node to the AST is // just clutter. // FIXME: We don't emit lifetime markers for the temporaries due to this. // FIXME: Do any other AST consumers care about this? 
    return E;
  }

  // GCC seems to also exclude expressions of incomplete enum type.
  if (const EnumType *T = E->getType()->getAs<EnumType>()) {
    if (!T->getDecl()->isComplete()) {
      // FIXME: stupid workaround for a codegen bug!
      E = ImpCastExprToType(E, Context.VoidTy, CK_ToVoid).get();
      return E;
    }
  }

  ExprResult Res = DefaultFunctionArrayLvalueConversion(E);
  if (Res.isInvalid())
    return E;
  E = Res.get();

  if (!E->getType()->isVoidType())
    RequireCompleteType(E->getExprLoc(), E->getType(),
                        diag::err_incomplete_type);
  return E;
}

ExprResult Sema::CheckUnevaluatedOperand(Expr *E) {
  // Per C++2a [expr.ass]p5, a volatile assignment is not deprecated if
  // it occurs as an unevaluated operand.
  CheckUnusedVolatileAssignment(E);
  return E;
}

// If we can unambiguously determine whether Var can never be used
// in a constant expression, return true.
//  - if the variable and its initializer are non-dependent, then
//    we can unambiguously check if the variable is a constant expression.
//  - if the initializer is not value dependent - we can determine whether
//    it can be used to initialize a constant expression. If Init cannot
//    be used to initialize a constant expression we conclude that Var can
//    never be a constant expression.
//  - FIXME: if the initializer is dependent, we can still do some analysis and
//    identify certain cases unambiguously as non-const by using a Visitor:
//      - such as those that involve odr-use of a ParmVarDecl, involve a new
//        delete, lambda-expr, dynamic-cast, reinterpret-cast etc...
static inline bool VariableCanNeverBeAConstantExpression(VarDecl *Var,
                                                         ASTContext &Context) {
  if (isa<ParmVarDecl>(Var)) return true;
  const VarDecl *DefVD = nullptr;

  // If there is no initializer - this cannot be a constant expression.
  const Expr *Init = Var->getAnyInitializer(DefVD);
  if (!Init)
    return true;
  assert(DefVD);
  if (DefVD->isWeak())
    return false;

  if (Var->getType()->isDependentType() || Init->isValueDependent()) {
    // FIXME: Teach the constant evaluator to deal with the non-dependent parts
    // of value-dependent expressions, and use it here to determine whether the
    // initializer is a potential constant expression.
    return false;
  }

  return !Var->isUsableInConstantExpressions(Context);
}

/// Check if the current lambda has any potential captures
/// that must be captured by any of its enclosing lambdas that are ready to
/// capture. If there is a lambda that can capture a nested
/// potential-capture, go ahead and do so. Also, check to see if any
/// variables are uncaptureable or do not involve an odr-use so do not
/// need to be captured.
static void CheckIfAnyEnclosingLambdasMustCaptureAnyPotentialCaptures(
    Expr *const FE, LambdaScopeInfo *const CurrentLSI, Sema &S) {

  assert(!S.isUnevaluatedContext());
  assert(S.CurContext->isDependentContext());
#ifndef NDEBUG
  DeclContext *DC = S.CurContext;
  while (DC && isa<CapturedDecl>(DC))
    DC = DC->getParent();
  assert(
      CurrentLSI->CallOperator == DC &&
      "The current call operator must be synchronized with Sema's CurContext");
#endif // NDEBUG

  const bool IsFullExprInstantiationDependent = FE->isInstantiationDependent();

  // All the potentially captureable variables in the current nested
  // lambda (within a generic outer lambda), must be captured by an
  // outer lambda that is enclosed within a non-dependent context.
  CurrentLSI->visitPotentialCaptures([&](ValueDecl *Var, Expr *VarExpr) {
    // If the variable is clearly identified as non-odr-used and the full
    // expression is not instantiation dependent, only then do we not
    // need to check enclosing lambdas for speculative captures.
    // For example:
    // Even though 'x' is not odr-used, it should be captured.
    // int test() {
    //   const int x = 10;
    //   auto L = [=](auto a) {
    //     (void) +x + a;
    //   };
    // }
    if (CurrentLSI->isVariableExprMarkedAsNonODRUsed(VarExpr) &&
        !IsFullExprInstantiationDependent)
      return;

    VarDecl *UnderlyingVar = Var->getPotentiallyDecomposedVarDecl();
    if (!UnderlyingVar)
      return;

    // If we have a capture-capable lambda for the variable, go ahead and
    // capture the variable in that lambda (and all its enclosing lambdas).
    if (const std::optional<unsigned> Index =
            getStackIndexOfNearestEnclosingCaptureCapableLambda(
                S.FunctionScopes, Var, S))
      S.MarkCaptureUsedInEnclosingContext(Var, VarExpr->getExprLoc(), *Index);
    const bool IsVarNeverAConstantExpression =
        VariableCanNeverBeAConstantExpression(UnderlyingVar, S.Context);
    if (!IsFullExprInstantiationDependent || IsVarNeverAConstantExpression) {
      // This full expression is not instantiation dependent or the variable
      // cannot be used in a constant expression - which means
      // this variable must be odr-used here, so diagnose a
      // capture violation early, if the variable is un-captureable.
      // This is purely for diagnosing errors early. Otherwise, this
      // error would get diagnosed when the lambda becomes capture ready.
      QualType CaptureType, DeclRefType;
      SourceLocation ExprLoc = VarExpr->getExprLoc();
      if (S.tryCaptureVariable(Var, ExprLoc, S.TryCapture_Implicit,
                          /*EllipsisLoc*/ SourceLocation(),
                          /*BuildAndDiagnose*/false, CaptureType,
                          DeclRefType, nullptr)) {
        // We will never be able to capture this variable, and we need
        // to be able to in any and all instantiations, so diagnose it.
        S.tryCaptureVariable(Var, ExprLoc, S.TryCapture_Implicit,
                          /*EllipsisLoc*/ SourceLocation(),
                          /*BuildAndDiagnose*/true, CaptureType,
                          DeclRefType, nullptr);
      }
    }
  });

  // Check if 'this' needs to be captured.
  if (CurrentLSI->hasPotentialThisCapture()) {
    // If we have a capture-capable lambda for 'this', go ahead and capture
    // 'this' in that lambda (and all its enclosing lambdas).
    if (const std::optional<unsigned> Index =
            getStackIndexOfNearestEnclosingCaptureCapableLambda(
                S.FunctionScopes, /*0 is 'this'*/ nullptr, S)) {
      const unsigned FunctionScopeIndexOfCapturableLambda = *Index;
      S.CheckCXXThisCapture(CurrentLSI->PotentialThisCaptureLocation,
                            /*Explicit*/ false, /*BuildAndDiagnose*/ true,
                            &FunctionScopeIndexOfCapturableLambda);
    }
  }

  // Reset all the potential captures at the end of each full-expression.
  CurrentLSI->clearPotentialCaptures();
}

static ExprResult attemptRecovery(Sema &SemaRef,
                                  const TypoCorrectionConsumer &Consumer,
                                  const TypoCorrection &TC) {
  LookupResult R(SemaRef, Consumer.getLookupResult().getLookupNameInfo(),
                 Consumer.getLookupResult().getLookupKind());
  const CXXScopeSpec *SS = Consumer.getSS();
  CXXScopeSpec NewSS;

  // Use an appropriate CXXScopeSpec for building the expr.
  if (auto *NNS = TC.getCorrectionSpecifier())
    NewSS.MakeTrivial(SemaRef.Context, NNS, TC.getCorrectionRange());
  else if (SS && !TC.WillReplaceSpecifier())
    NewSS = *SS;

  if (auto *ND = TC.getFoundDecl()) {
    R.setLookupName(ND->getDeclName());
    R.addDecl(ND);
    if (ND->isCXXClassMember()) {
      // Figure out the correct naming class to add to the LookupResult.
      CXXRecordDecl *Record = nullptr;
      if (auto *NNS = TC.getCorrectionSpecifier())
        Record = NNS->getAsType()->getAsCXXRecordDecl();
      if (!Record)
        Record =
            dyn_cast<CXXRecordDecl>(ND->getDeclContext()->getRedeclContext());
      if (Record)
        R.setNamingClass(Record);

      // Detect and handle the case where the decl might be an implicit
      // member.
bool MightBeImplicitMember; if (!Consumer.isAddressOfOperand()) MightBeImplicitMember = true; else if (!NewSS.isEmpty()) MightBeImplicitMember = false; else if (R.isOverloadedResult()) MightBeImplicitMember = false; else if (R.isUnresolvableResult()) MightBeImplicitMember = true; else MightBeImplicitMember = isa(ND) || isa(ND) || isa(ND); if (MightBeImplicitMember) return SemaRef.BuildPossibleImplicitMemberExpr( NewSS, /*TemplateKWLoc*/ SourceLocation(), R, /*TemplateArgs*/ nullptr, /*S*/ nullptr); } else if (auto *Ivar = dyn_cast(ND)) { return SemaRef.LookupInObjCMethod(R, Consumer.getScope(), Ivar->getIdentifier()); } } return SemaRef.BuildDeclarationNameExpr(NewSS, R, /*NeedsADL*/ false, /*AcceptInvalidDecl*/ true); } namespace { class FindTypoExprs : public RecursiveASTVisitor { llvm::SmallSetVector &TypoExprs; public: explicit FindTypoExprs(llvm::SmallSetVector &TypoExprs) : TypoExprs(TypoExprs) {} bool VisitTypoExpr(TypoExpr *TE) { TypoExprs.insert(TE); return true; } }; class TransformTypos : public TreeTransform { typedef TreeTransform BaseTransform; VarDecl *InitDecl; // A decl to avoid as a correction because it is in the // process of being initialized. llvm::function_ref ExprFilter; llvm::SmallSetVector TypoExprs, AmbiguousTypoExprs; llvm::SmallDenseMap TransformCache; llvm::SmallDenseMap OverloadResolution; /// Emit diagnostics for all of the TypoExprs encountered. /// /// If the TypoExprs were successfully corrected, then the diagnostics should /// suggest the corrections. Otherwise the diagnostics will not suggest /// anything (having been passed an empty TypoCorrection). /// /// If we've failed to correct due to ambiguous corrections, we need to /// be sure to pass empty corrections and replacements. Otherwise it's /// possible that the Consumer has a TypoCorrection that failed to ambiguity /// and we don't want to report those diagnostics. void EmitAllDiagnostics(bool IsAmbiguous) { for (TypoExpr *TE : TypoExprs) { auto &State = SemaRef.getTypoExprState(TE); if (State.DiagHandler) { TypoCorrection TC = IsAmbiguous ? TypoCorrection() : State.Consumer->getCurrentCorrection(); ExprResult Replacement = IsAmbiguous ? ExprError() : TransformCache[TE]; // Extract the NamedDecl from the transformed TypoExpr and add it to the // TypoCorrection, replacing the existing decls. This ensures the right // NamedDecl is used in diagnostics e.g. in the case where overload // resolution was used to select one from several possible decls that // had been stored in the TypoCorrection. if (auto *ND = getDeclFromExpr( Replacement.isInvalid() ? nullptr : Replacement.get())) TC.setCorrectionDecl(ND); State.DiagHandler(TC); } SemaRef.clearDelayedTypo(TE); } } /// Try to advance the typo correction state of the first unfinished TypoExpr. /// We allow advancement of the correction stream by removing it from the /// TransformCache which allows `TransformTypoExpr` to advance during the /// next transformation attempt. /// /// Any substitution attempts for the previous TypoExprs (which must have been /// finished) will need to be retried since it's possible that they will now /// be invalid given the latest advancement. /// /// We need to be sure that we're making progress - it's possible that the /// tree is so malformed that the transform never makes it to the /// `TransformTypoExpr`. /// /// Returns true if there are any untried correction combinations. 
  bool CheckAndAdvanceTypoExprCorrectionStreams() {
    for (auto *TE : TypoExprs) {
      auto &State = SemaRef.getTypoExprState(TE);
      TransformCache.erase(TE);
      if (!State.Consumer->hasMadeAnyCorrectionProgress())
        return false;
      if (!State.Consumer->finished())
        return true;
      State.Consumer->resetCorrectionStream();
    }
    return false;
  }

  NamedDecl *getDeclFromExpr(Expr *E) {
    if (auto *OE = dyn_cast_or_null<OverloadExpr>(E))
      E = OverloadResolution[OE];

    if (!E)
      return nullptr;
    if (auto *DRE = dyn_cast<DeclRefExpr>(E))
      return DRE->getFoundDecl();
    if (auto *ME = dyn_cast<MemberExpr>(E))
      return ME->getFoundDecl();
    // FIXME: Add any other expr types that could be seen by the delayed typo
    // correction TreeTransform for which the corresponding TypoCorrection could
    // contain multiple decls.
    return nullptr;
  }

  ExprResult TryTransform(Expr *E) {
    Sema::SFINAETrap Trap(SemaRef);
    ExprResult Res = TransformExpr(E);
    if (Trap.hasErrorOccurred() || Res.isInvalid())
      return ExprError();

    return ExprFilter(Res.get());
  }

  // Since correcting typos may introduce new TypoExprs, this function
  // checks for new TypoExprs and recurses if it finds any. Note that it will
  // only succeed if it is able to correct all typos in the given expression.
  ExprResult CheckForRecursiveTypos(ExprResult Res, bool &IsAmbiguous) {
    if (Res.isInvalid()) {
      return Res;
    }
    // Check to see if any new TypoExprs were created. If so, we need to recurse
    // to check their validity.
    Expr *FixedExpr = Res.get();

    auto SavedTypoExprs = std::move(TypoExprs);
    auto SavedAmbiguousTypoExprs = std::move(AmbiguousTypoExprs);
    TypoExprs.clear();
    AmbiguousTypoExprs.clear();

    FindTypoExprs(TypoExprs).TraverseStmt(FixedExpr);
    if (!TypoExprs.empty()) {
      // Recurse to handle newly created TypoExprs. If we're not able to
      // handle them, discard these TypoExprs.
      ExprResult RecurResult =
          RecursiveTransformLoop(FixedExpr, IsAmbiguous);
      if (RecurResult.isInvalid()) {
        Res = ExprError();
        // Recursive corrections didn't work, wipe them away and don't add
        // them to the TypoExprs set. Remove them from Sema's TypoExpr list
        // since we don't want to clear them twice. Note: it's possible the
        // TypoExprs were created recursively and thus won't be in our
        // Sema's TypoExprs - they were created in our `RecursiveTransformLoop`.
        auto &SemaTypoExprs = SemaRef.TypoExprs;
        for (auto *TE : TypoExprs) {
          TransformCache.erase(TE);
          SemaRef.clearDelayedTypo(TE);

          auto SI = find(SemaTypoExprs, TE);
          if (SI != SemaTypoExprs.end()) {
            SemaTypoExprs.erase(SI);
          }
        }
      } else {
        // TypoExpr is valid: add newly created TypoExprs since we were
        // able to correct them.
        Res = RecurResult;
        SavedTypoExprs.set_union(TypoExprs);
      }
    }

    TypoExprs = std::move(SavedTypoExprs);
    AmbiguousTypoExprs = std::move(SavedAmbiguousTypoExprs);

    return Res;
  }

  // Try to transform the given expression, looping through the correction
  // candidates with `CheckAndAdvanceTypoExprCorrectionStreams`.
  //
  // If valid ambiguous typo corrections are seen, `IsAmbiguous` is set to
  // true and this method will immediately return an `ExprError`.
  ExprResult RecursiveTransformLoop(Expr *E, bool &IsAmbiguous) {
    ExprResult Res;
    auto SavedTypoExprs = std::move(SemaRef.TypoExprs);
    SemaRef.TypoExprs.clear();

    while (true) {
      Res = CheckForRecursiveTypos(TryTransform(E), IsAmbiguous);

      // Recursion encountered an ambiguous correction. This means that our
      // correction itself is ambiguous, so stop now.
      if (IsAmbiguous)
        break;

      // If the transform is still valid after checking for any new typos,
      // it's good to go.
if (!Res.isInvalid()) break; // The transform was invalid, see if we have any TypoExprs with untried // correction candidates. if (!CheckAndAdvanceTypoExprCorrectionStreams()) break; } // If we found a valid result, double check to make sure it's not ambiguous. if (!IsAmbiguous && !Res.isInvalid() && !AmbiguousTypoExprs.empty()) { auto SavedTransformCache = llvm::SmallDenseMap(TransformCache); // Ensure none of the TypoExprs have multiple typo correction candidates // with the same edit length that pass all the checks and filters. while (!AmbiguousTypoExprs.empty()) { auto TE = AmbiguousTypoExprs.back(); // TryTransform itself can create new Typos, adding them to the TypoExpr map // and invalidating our TypoExprState, so always fetch it instead of storing. SemaRef.getTypoExprState(TE).Consumer->saveCurrentPosition(); TypoCorrection TC = SemaRef.getTypoExprState(TE).Consumer->peekNextCorrection(); TypoCorrection Next; do { // Fetch the next correction by erasing the typo from the cache and calling // `TryTransform` which will iterate through corrections in // `TransformTypoExpr`. TransformCache.erase(TE); ExprResult AmbigRes = CheckForRecursiveTypos(TryTransform(E), IsAmbiguous); if (!AmbigRes.isInvalid() || IsAmbiguous) { SemaRef.getTypoExprState(TE).Consumer->resetCorrectionStream(); SavedTransformCache.erase(TE); Res = ExprError(); IsAmbiguous = true; break; } } while ((Next = SemaRef.getTypoExprState(TE).Consumer->peekNextCorrection()) && Next.getEditDistance(false) == TC.getEditDistance(false)); if (IsAmbiguous) break; AmbiguousTypoExprs.remove(TE); SemaRef.getTypoExprState(TE).Consumer->restoreSavedPosition(); TransformCache[TE] = SavedTransformCache[TE]; } TransformCache = std::move(SavedTransformCache); } // Wipe away any newly created TypoExprs that we don't know about. Since we // clear any invalid TypoExprs in `CheckForRecursiveTypos`, this is only // possible if a `TypoExpr` is created during a transformation but then // fails before we can discover it. auto &SemaTypoExprs = SemaRef.TypoExprs; for (auto Iterator = SemaTypoExprs.begin(); Iterator != SemaTypoExprs.end();) { auto TE = *Iterator; auto FI = find(TypoExprs, TE); if (FI != TypoExprs.end()) { Iterator++; continue; } SemaRef.clearDelayedTypo(TE); Iterator = SemaTypoExprs.erase(Iterator); } SemaRef.TypoExprs = std::move(SavedTypoExprs); return Res; } public: TransformTypos(Sema &SemaRef, VarDecl *InitDecl, llvm::function_ref Filter) : BaseTransform(SemaRef), InitDecl(InitDecl), ExprFilter(Filter) {} ExprResult RebuildCallExpr(Expr *Callee, SourceLocation LParenLoc, MultiExprArg Args, SourceLocation RParenLoc, Expr *ExecConfig = nullptr) { auto Result = BaseTransform::RebuildCallExpr(Callee, LParenLoc, Args, RParenLoc, ExecConfig); if (auto *OE = dyn_cast(Callee)) { if (Result.isUsable()) { Expr *ResultCall = Result.get(); if (auto *BE = dyn_cast(ResultCall)) ResultCall = BE->getSubExpr(); if (auto *CE = dyn_cast(ResultCall)) OverloadResolution[OE] = CE->getCallee(); } } return Result; } ExprResult TransformLambdaExpr(LambdaExpr *E) { return Owned(E); } ExprResult TransformBlockExpr(BlockExpr *E) { return Owned(E); } ExprResult Transform(Expr *E) { bool IsAmbiguous = false; ExprResult Res = RecursiveTransformLoop(E, IsAmbiguous); if (!Res.isUsable()) FindTypoExprs(TypoExprs).TraverseStmt(E); EmitAllDiagnostics(IsAmbiguous); return Res; } ExprResult TransformTypoExpr(TypoExpr *E) { // If the TypoExpr hasn't been seen before, record it. 
Otherwise, return the // cached transformation result if there is one and the TypoExpr isn't the // first one that was encountered. auto &CacheEntry = TransformCache[E]; if (!TypoExprs.insert(E) && !CacheEntry.isUnset()) { return CacheEntry; } auto &State = SemaRef.getTypoExprState(E); assert(State.Consumer && "Cannot transform a cleared TypoExpr"); // For the first TypoExpr and an uncached TypoExpr, find the next likely // typo correction and return it. while (TypoCorrection TC = State.Consumer->getNextCorrection()) { if (InitDecl && TC.getFoundDecl() == InitDecl) continue; // FIXME: If we would typo-correct to an invalid declaration, it's // probably best to just suppress all errors from this typo correction. ExprResult NE = State.RecoveryHandler ? State.RecoveryHandler(SemaRef, E, TC) : attemptRecovery(SemaRef, *State.Consumer, TC); if (!NE.isInvalid()) { // Check whether there may be a second viable correction with the same // edit distance; if so, remember this TypoExpr may have an ambiguous // correction so it can be more thoroughly vetted later. TypoCorrection Next; if ((Next = State.Consumer->peekNextCorrection()) && Next.getEditDistance(false) == TC.getEditDistance(false)) { AmbiguousTypoExprs.insert(E); } else { AmbiguousTypoExprs.remove(E); } assert(!NE.isUnset() && "Typo was transformed into a valid-but-null ExprResult"); return CacheEntry = NE; } } return CacheEntry = ExprError(); } }; } ExprResult Sema::CorrectDelayedTyposInExpr(Expr *E, VarDecl *InitDecl, bool RecoverUncorrectedTypos, llvm::function_ref Filter) { // If the current evaluation context indicates there are uncorrected typos // and the current expression isn't guaranteed to not have typos, try to // resolve any TypoExpr nodes that might be in the expression. if (E && !ExprEvalContexts.empty() && ExprEvalContexts.back().NumTypos && (E->isTypeDependent() || E->isValueDependent() || E->isInstantiationDependent())) { auto TyposResolved = DelayedTypos.size(); auto Result = TransformTypos(*this, InitDecl, Filter).Transform(E); TyposResolved -= DelayedTypos.size(); if (Result.isInvalid() || Result.get() != E) { ExprEvalContexts.back().NumTypos -= TyposResolved; if (Result.isInvalid() && RecoverUncorrectedTypos) { struct TyposReplace : TreeTransform { TyposReplace(Sema &SemaRef) : TreeTransform(SemaRef) {} ExprResult TransformTypoExpr(clang::TypoExpr *E) { return this->SemaRef.CreateRecoveryExpr(E->getBeginLoc(), E->getEndLoc(), {}); } } TT(*this); return TT.TransformExpr(E); } return Result; } assert(TyposResolved == 0 && "Corrected typo but got same Expr back?"); } return E; } ExprResult Sema::ActOnFinishFullExpr(Expr *FE, SourceLocation CC, bool DiscardedValue, bool IsConstexpr, bool IsTemplateArgument) { ExprResult FullExpr = FE; if (!FullExpr.get()) return ExprError(); if (!IsTemplateArgument && DiagnoseUnexpandedParameterPack(FullExpr.get())) return ExprError(); if (DiscardedValue) { // Top-level expressions default to 'id' when we're in a debugger. 
if (getLangOpts().DebuggerCastResultToId && FullExpr.get()->getType() == Context.UnknownAnyTy) { FullExpr = forceUnknownAnyToType(FullExpr.get(), Context.getObjCIdType()); if (FullExpr.isInvalid()) return ExprError(); } FullExpr = CheckPlaceholderExpr(FullExpr.get()); if (FullExpr.isInvalid()) return ExprError(); FullExpr = IgnoredValueConversions(FullExpr.get()); if (FullExpr.isInvalid()) return ExprError(); DiagnoseUnusedExprResult(FullExpr.get(), diag::warn_unused_expr); } FullExpr = CorrectDelayedTyposInExpr(FullExpr.get(), /*InitDecl=*/nullptr, /*RecoverUncorrectedTypos=*/true); if (FullExpr.isInvalid()) return ExprError(); CheckCompletedExpr(FullExpr.get(), CC, IsConstexpr); // At the end of this full expression (which could be a deeply nested // lambda), if there is a potential capture within the nested lambda, // have the outer capture-able lambda try and capture it. // Consider the following code: // void f(int, int); // void f(const int&, double); // void foo() { // const int x = 10, y = 20; // auto L = [=](auto a) { // auto M = [=](auto b) { // f(x, b); <-- requires x to be captured by L and M // f(y, a); <-- requires y to be captured by L, but not all Ms // }; // }; // } // FIXME: Also consider what happens for something like this that involves // the gnu-extension statement-expressions or even lambda-init-captures: // void f() { // const int n = 0; // auto L = [&](auto a) { // +n + ({ 0; a; }); // }; // } // // Here, we see +n, and then the full-expression 0; ends, so we don't // capture n (and instead remove it from our list of potential captures), // and then the full-expression +n + ({ 0; }); ends, but it's too late // for us to see that we need to capture n after all. LambdaScopeInfo *const CurrentLSI = getCurLambda(/*IgnoreCapturedRegions=*/true); // FIXME: PR 17877 showed that getCurLambda() can return a valid pointer // even if CurContext is not a lambda call operator. Refer to that Bug Report // for an example of the code that might cause this asynchrony. // By ensuring we are in the context of a lambda's call operator // we can fix the bug (we only need to check whether we need to capture // if we are within a lambda's body); but per the comments in that // PR, a proper fix would entail : // "Alternative suggestion: // - Add to Sema an integer holding the smallest (outermost) scope // index that we are *lexically* within, and save/restore/set to // FunctionScopes.size() in InstantiatingTemplate's // constructor/destructor. // - Teach the handful of places that iterate over FunctionScopes to // stop at the outermost enclosing lexical scope." DeclContext *DC = CurContext; while (DC && isa(DC)) DC = DC->getParent(); const bool IsInLambdaDeclContext = isLambdaCallOperator(DC); if (IsInLambdaDeclContext && CurrentLSI && CurrentLSI->hasPotentialCaptures() && !FullExpr.isInvalid()) CheckIfAnyEnclosingLambdasMustCaptureAnyPotentialCaptures(FE, CurrentLSI, *this); return MaybeCreateExprWithCleanups(FullExpr); } StmtResult Sema::ActOnFinishFullStmt(Stmt *FullStmt) { if (!FullStmt) return StmtError(); return MaybeCreateStmtWithCleanups(FullStmt); } Sema::IfExistsResult Sema::CheckMicrosoftIfExistsSymbol(Scope *S, CXXScopeSpec &SS, const DeclarationNameInfo &TargetNameInfo) { DeclarationName TargetName = TargetNameInfo.getName(); if (!TargetName) return IER_DoesNotExist; // If the name itself is dependent, then the result is dependent. if (TargetName.isDependentName()) return IER_Dependent; // Do the redeclaration lookup in the current scope. 
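// For illustration (not part of this patch), the Microsoft extension being
// classified here looks roughly like:
//
//   struct S { void found(); };
//
//   template <typename T> void f() {
//     __if_exists (T::found) { /* compiled only if T::found exists */ }
//     __if_not_exists (T::missing) { /* compiled only if it does not */ }
//   }
//
// A dependent name such as T::found is classified as IER_Dependent and is
// re-checked once T is known.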
LookupResult R(*this, TargetNameInfo, Sema::LookupAnyName, Sema::NotForRedeclaration); LookupParsedName(R, S, &SS); R.suppressDiagnostics(); switch (R.getResultKind()) { case LookupResult::Found: case LookupResult::FoundOverloaded: case LookupResult::FoundUnresolvedValue: case LookupResult::Ambiguous: return IER_Exists; case LookupResult::NotFound: return IER_DoesNotExist; case LookupResult::NotFoundInCurrentInstantiation: return IER_Dependent; } llvm_unreachable("Invalid LookupResult Kind!"); } Sema::IfExistsResult Sema::CheckMicrosoftIfExistsSymbol(Scope *S, SourceLocation KeywordLoc, bool IsIfExists, CXXScopeSpec &SS, UnqualifiedId &Name) { DeclarationNameInfo TargetNameInfo = GetNameFromUnqualifiedId(Name); // Check for an unexpanded parameter pack. auto UPPC = IsIfExists ? UPPC_IfExists : UPPC_IfNotExists; if (DiagnoseUnexpandedParameterPack(SS, UPPC) || DiagnoseUnexpandedParameterPack(TargetNameInfo, UPPC)) return IER_Error; return CheckMicrosoftIfExistsSymbol(S, SS, TargetNameInfo); } concepts::Requirement *Sema::ActOnSimpleRequirement(Expr *E) { return BuildExprRequirement(E, /*IsSimple=*/true, /*NoexceptLoc=*/SourceLocation(), /*ReturnTypeRequirement=*/{}); } concepts::Requirement * Sema::ActOnTypeRequirement(SourceLocation TypenameKWLoc, CXXScopeSpec &SS, SourceLocation NameLoc, IdentifierInfo *TypeName, TemplateIdAnnotation *TemplateId) { assert(((!TypeName && TemplateId) || (TypeName && !TemplateId)) && "Exactly one of TypeName and TemplateId must be specified."); TypeSourceInfo *TSI = nullptr; if (TypeName) { QualType T = CheckTypenameType(ETK_Typename, TypenameKWLoc, SS.getWithLocInContext(Context), *TypeName, NameLoc, &TSI, /*DeducedTSTContext=*/false); if (T.isNull()) return nullptr; } else { ASTTemplateArgsPtr ArgsPtr(TemplateId->getTemplateArgs(), TemplateId->NumArgs); TypeResult T = ActOnTypenameType(CurScope, TypenameKWLoc, SS, TemplateId->TemplateKWLoc, TemplateId->Template, TemplateId->Name, TemplateId->TemplateNameLoc, TemplateId->LAngleLoc, ArgsPtr, TemplateId->RAngleLoc); if (T.isInvalid()) return nullptr; if (GetTypeFromParser(T.get(), &TSI).isNull()) return nullptr; } return BuildTypeRequirement(TSI); } concepts::Requirement * Sema::ActOnCompoundRequirement(Expr *E, SourceLocation NoexceptLoc) { return BuildExprRequirement(E, /*IsSimple=*/false, NoexceptLoc, /*ReturnTypeRequirement=*/{}); } concepts::Requirement * Sema::ActOnCompoundRequirement( Expr *E, SourceLocation NoexceptLoc, CXXScopeSpec &SS, TemplateIdAnnotation *TypeConstraint, unsigned Depth) { // C++2a [expr.prim.req.compound] p1.3.3 // [..] the expression is deduced against an invented function template // F [...] F is a void function template with a single type template // parameter T declared with the constrained-parameter. Form a new // cv-qualifier-seq cv by taking the union of const and volatile specifiers // around the constrained-parameter. F has a single parameter whose // type-specifier is cv T followed by the abstract-declarator. [...] // // The cv part is done in the calling function - we get the concept with // arguments and the abstract declarator with the correct CV qualification and // have to synthesize T and the single parameter of F. 
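// For illustration (not part of this patch): the kind of compound requirement
// handled here is, for example,
//
//   #include <concepts>
//   template <typename T>
//   concept Addable = requires (T a, T b) {
//     { a + b } noexcept -> std::convertible_to<T>;
//   };
//
// The return-type-requirement after '->' is checked by inventing a single
// constrained template parameter (named "expr-type" below) and asking whether
// std::convertible_to<decltype((a + b)), T> is satisfied.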
  auto &II = Context.Idents.get("expr-type");
  auto *TParam = TemplateTypeParmDecl::Create(Context, CurContext,
                                              SourceLocation(),
                                              SourceLocation(), Depth,
                                              /*Index=*/0, &II,
                                              /*Typename=*/true,
                                              /*ParameterPack=*/false,
                                              /*HasTypeConstraint=*/true);

  if (BuildTypeConstraint(SS, TypeConstraint, TParam,
                          /*EllipsisLoc=*/SourceLocation(),
                          /*AllowUnexpandedPack=*/true))
    // Just produce a requirement with no type requirements.
    return BuildExprRequirement(E, /*IsSimple=*/false, NoexceptLoc, {});

  auto *TPL = TemplateParameterList::Create(Context, SourceLocation(),
                                            SourceLocation(),
                                            ArrayRef<NamedDecl *>(TParam),
                                            SourceLocation(),
                                            /*RequiresClause=*/nullptr);
  return BuildExprRequirement(
      E, /*IsSimple=*/false, NoexceptLoc,
      concepts::ExprRequirement::ReturnTypeRequirement(TPL));
}

concepts::ExprRequirement *
Sema::BuildExprRequirement(
    Expr *E, bool IsSimple, SourceLocation NoexceptLoc,
    concepts::ExprRequirement::ReturnTypeRequirement ReturnTypeRequirement) {
  auto Status = concepts::ExprRequirement::SS_Satisfied;
  ConceptSpecializationExpr *SubstitutedConstraintExpr = nullptr;
  if (E->isInstantiationDependent() || ReturnTypeRequirement.isDependent())
    Status = concepts::ExprRequirement::SS_Dependent;
  else if (NoexceptLoc.isValid() && canThrow(E) == CanThrowResult::CT_Can)
    Status = concepts::ExprRequirement::SS_NoexceptNotMet;
  else if (ReturnTypeRequirement.isSubstitutionFailure())
    Status = concepts::ExprRequirement::SS_TypeRequirementSubstitutionFailure;
  else if (ReturnTypeRequirement.isTypeConstraint()) {
    // C++2a [expr.prim.req]p1.3.3
    //     The immediately-declared constraint ([temp]) of decltype((E)) shall
    //     be satisfied.
    TemplateParameterList *TPL =
        ReturnTypeRequirement.getTypeConstraintTemplateParameterList();
    QualType MatchedType =
        Context.getReferenceQualifiedType(E).getCanonicalType();
    llvm::SmallVector<TemplateArgument> Args;
    Args.push_back(TemplateArgument(MatchedType));

    auto *Param = cast<TemplateTypeParmDecl>(TPL->getParam(0));

    TemplateArgumentList TAL(TemplateArgumentList::OnStack, Args);
    MultiLevelTemplateArgumentList MLTAL(Param, TAL.asArray(),
                                         /*Final=*/false);
    MLTAL.addOuterRetainedLevels(TPL->getDepth());
-    Expr *IDC = Param->getTypeConstraint()->getImmediatelyDeclaredConstraint();
+    const TypeConstraint *TC = Param->getTypeConstraint();
+    assert(TC && "Type Constraint cannot be null here");
+    auto *IDC = TC->getImmediatelyDeclaredConstraint();
+    assert(IDC && "ImmediatelyDeclaredConstraint can't be null here.");
    ExprResult Constraint = SubstExpr(IDC, MLTAL);
    if (Constraint.isInvalid()) {
-      Status = concepts::ExprRequirement::SS_ExprSubstitutionFailure;
-    } else {
-      SubstitutedConstraintExpr =
-          cast<ConceptSpecializationExpr>(Constraint.get());
-      if (!SubstitutedConstraintExpr->isSatisfied())
-        Status = concepts::ExprRequirement::SS_ConstraintsNotSatisfied;
-    }
+      return new (Context) concepts::ExprRequirement(
+          concepts::createSubstDiagAt(*this, IDC->getExprLoc(),
+                                      [&](llvm::raw_ostream &OS) {
+                                        IDC->printPretty(OS, /*Helper=*/nullptr,
+                                                         getPrintingPolicy());
+                                      }),
+          IsSimple, NoexceptLoc, ReturnTypeRequirement);
+    }
+    SubstitutedConstraintExpr =
+        cast<ConceptSpecializationExpr>(Constraint.get());
+    if (!SubstitutedConstraintExpr->isSatisfied())
+      Status = concepts::ExprRequirement::SS_ConstraintsNotSatisfied;
  }
  return new (Context) concepts::ExprRequirement(E, IsSimple, NoexceptLoc,
                                                 ReturnTypeRequirement, Status,
                                                 SubstitutedConstraintExpr);
}

concepts::ExprRequirement *
Sema::BuildExprRequirement(
    concepts::Requirement::SubstitutionDiagnostic *ExprSubstitutionDiagnostic,
    bool IsSimple, SourceLocation NoexceptLoc,
    concepts::ExprRequirement::ReturnTypeRequirement ReturnTypeRequirement) {
  return new (Context) concepts::ExprRequirement(ExprSubstitutionDiagnostic,
                                                 IsSimple, NoexceptLoc,
                                                 ReturnTypeRequirement);
}

concepts::TypeRequirement *
Sema::BuildTypeRequirement(TypeSourceInfo *Type) {
  return new (Context) concepts::TypeRequirement(Type);
}

concepts::TypeRequirement *
Sema::BuildTypeRequirement(
    concepts::Requirement::SubstitutionDiagnostic *SubstDiag) {
  return new (Context) concepts::TypeRequirement(SubstDiag);
}

concepts::Requirement *Sema::ActOnNestedRequirement(Expr *Constraint) {
  return BuildNestedRequirement(Constraint);
}

concepts::NestedRequirement *
Sema::BuildNestedRequirement(Expr *Constraint) {
  ConstraintSatisfaction Satisfaction;
  if (!Constraint->isInstantiationDependent() &&
      CheckConstraintSatisfaction(nullptr, {Constraint},
                                  /*TemplateArgs=*/{},
                                  Constraint->getSourceRange(), Satisfaction))
    return nullptr;
  return new (Context) concepts::NestedRequirement(Context, Constraint,
                                                   Satisfaction);
}

concepts::NestedRequirement *
Sema::BuildNestedRequirement(StringRef InvalidConstraintEntity,
                             const ASTConstraintSatisfaction &Satisfaction) {
  return new (Context) concepts::NestedRequirement(
      InvalidConstraintEntity,
      ASTConstraintSatisfaction::Rebuild(Context, Satisfaction));
}

RequiresExprBodyDecl *
Sema::ActOnStartRequiresExpr(SourceLocation RequiresKWLoc,
                             ArrayRef<ParmVarDecl *> LocalParameters,
                             Scope *BodyScope) {
  assert(BodyScope);

  RequiresExprBodyDecl *Body =
      RequiresExprBodyDecl::Create(Context, CurContext, RequiresKWLoc);

  PushDeclContext(BodyScope, Body);

  for (ParmVarDecl *Param : LocalParameters) {
    if (Param->hasDefaultArg())
      // C++2a [expr.prim.req] p4
      //     [...] A local parameter of a requires-expression shall not have a
      //     default argument. [...]
      Diag(Param->getDefaultArgRange().getBegin(),
           diag::err_requires_expr_local_parameter_default_argument);
    // Ignore default argument and move on

    Param->setDeclContext(Body);
    // If this has an identifier, add it to the scope stack.
    if (Param->getIdentifier()) {
      CheckShadow(BodyScope, Param);
      PushOnScopeChains(Param, BodyScope);
    }
  }
  return Body;
}

void Sema::ActOnFinishRequiresExpr() {
  assert(CurContext && "DeclContext imbalance!");
  CurContext = CurContext->getLexicalParent();
  assert(CurContext && "Popped translation unit!");
}

ExprResult Sema::ActOnRequiresExpr(SourceLocation RequiresKWLoc,
                                   RequiresExprBodyDecl *Body,
                                   ArrayRef<ParmVarDecl *> LocalParameters,
                                   ArrayRef<concepts::Requirement *> Requirements,
                                   SourceLocation ClosingBraceLoc) {
  auto *RE = RequiresExpr::Create(Context, RequiresKWLoc, Body, LocalParameters,
                                  Requirements, ClosingBraceLoc);
  if (DiagnoseUnexpandedParameterPackInRequiresExpr(RE))
    return ExprError();
  return RE;
}
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 8702e2ca3a1b..394006a57747 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -1,4260 +1,4273 @@
//===------- SemaTemplateInstantiate.cpp - C++ Template Instantiation ------===/
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//===----------------------------------------------------------------------===/
//
//  This file implements C++ template instantiation.
// //===----------------------------------------------------------------------===/ #include "TreeTransform.h" #include "clang/AST/ASTConcept.h" #include "clang/AST/ASTConsumer.h" #include "clang/AST/ASTContext.h" #include "clang/AST/ASTLambda.h" #include "clang/AST/ASTMutationListener.h" #include "clang/AST/DeclBase.h" #include "clang/AST/DeclTemplate.h" #include "clang/AST/Expr.h" #include "clang/AST/ExprConcepts.h" #include "clang/AST/PrettyDeclStackTrace.h" #include "clang/AST/Type.h" #include "clang/AST/TypeVisitor.h" #include "clang/Basic/LangOptions.h" #include "clang/Basic/Stack.h" #include "clang/Basic/TargetInfo.h" #include "clang/Sema/DeclSpec.h" #include "clang/Sema/EnterExpressionEvaluationContext.h" #include "clang/Sema/Initialization.h" #include "clang/Sema/Lookup.h" #include "clang/Sema/Sema.h" #include "clang/Sema/SemaConcept.h" #include "clang/Sema/SemaInternal.h" #include "clang/Sema/Template.h" #include "clang/Sema/TemplateDeduction.h" #include "clang/Sema/TemplateInstCallback.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TimeProfiler.h" #include using namespace clang; using namespace sema; //===----------------------------------------------------------------------===/ // Template Instantiation Support //===----------------------------------------------------------------------===/ namespace { namespace TemplateInstArgsHelpers { struct Response { const Decl *NextDecl = nullptr; bool IsDone = false; bool ClearRelativeToPrimary = true; static Response Done() { Response R; R.IsDone = true; return R; } static Response ChangeDecl(const Decl *ND) { Response R; R.NextDecl = ND; return R; } static Response ChangeDecl(const DeclContext *Ctx) { Response R; R.NextDecl = Decl::castFromDeclContext(Ctx); return R; } static Response UseNextDecl(const Decl *CurDecl) { return ChangeDecl(CurDecl->getDeclContext()); } static Response DontClearRelativeToPrimaryNextDecl(const Decl *CurDecl) { Response R = Response::UseNextDecl(CurDecl); R.ClearRelativeToPrimary = false; return R; } }; // Add template arguments from a variable template instantiation. Response HandleVarTemplateSpec(const VarTemplateSpecializationDecl *VarTemplSpec, MultiLevelTemplateArgumentList &Result, bool SkipForSpecialization) { // For a class-scope explicit specialization, there are no template arguments // at this level, but there may be enclosing template arguments. if (VarTemplSpec->isClassScopeExplicitSpecialization()) return Response::DontClearRelativeToPrimaryNextDecl(VarTemplSpec); // We're done when we hit an explicit specialization. if (VarTemplSpec->getSpecializationKind() == TSK_ExplicitSpecialization && !isa(VarTemplSpec)) return Response::Done(); // If this variable template specialization was instantiated from a // specialized member that is a variable template, we're done. 
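// By way of example (illustrative only, not part of this patch; the names are
// invented):
//
//   template <typename T> constexpr int Scale = sizeof(T);
//   template <> constexpr int Scale<float> = 1;   // explicit specialization
//
// Scale<float> has its own definition rather than a template pattern, so the
// walk stops here and collects no template arguments for it.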
assert(VarTemplSpec->getSpecializedTemplate() && "No variable template?"); llvm::PointerUnion Specialized = VarTemplSpec->getSpecializedTemplateOrPartial(); if (VarTemplatePartialSpecializationDecl *Partial = Specialized.dyn_cast()) { if (!SkipForSpecialization) Result.addOuterTemplateArguments( Partial, VarTemplSpec->getTemplateInstantiationArgs().asArray(), /*Final=*/false); if (Partial->isMemberSpecialization()) return Response::Done(); } else { VarTemplateDecl *Tmpl = Specialized.get(); if (!SkipForSpecialization) Result.addOuterTemplateArguments( Tmpl, VarTemplSpec->getTemplateInstantiationArgs().asArray(), /*Final=*/false); if (Tmpl->isMemberSpecialization()) return Response::Done(); } return Response::DontClearRelativeToPrimaryNextDecl(VarTemplSpec); } // If we have a template template parameter with translation unit context, // then we're performing substitution into a default template argument of // this template template parameter before we've constructed the template // that will own this template template parameter. In this case, we // use empty template parameter lists for all of the outer templates // to avoid performing any substitutions. Response HandleDefaultTempArgIntoTempTempParam(const TemplateTemplateParmDecl *TTP, MultiLevelTemplateArgumentList &Result) { for (unsigned I = 0, N = TTP->getDepth() + 1; I != N; ++I) Result.addOuterTemplateArguments(std::nullopt); return Response::Done(); } Response HandlePartialClassTemplateSpec( const ClassTemplatePartialSpecializationDecl *PartialClassTemplSpec, MultiLevelTemplateArgumentList &Result, bool SkipForSpecialization) { if (!SkipForSpecialization) Result.addOuterRetainedLevels(PartialClassTemplSpec->getTemplateDepth()); return Response::Done(); } // Add template arguments from a class template instantiation. Response HandleClassTemplateSpec(const ClassTemplateSpecializationDecl *ClassTemplSpec, MultiLevelTemplateArgumentList &Result, bool SkipForSpecialization) { if (!ClassTemplSpec->isClassScopeExplicitSpecialization()) { // We're done when we hit an explicit specialization. if (ClassTemplSpec->getSpecializationKind() == TSK_ExplicitSpecialization && !isa(ClassTemplSpec)) return Response::Done(); if (!SkipForSpecialization) Result.addOuterTemplateArguments( const_cast(ClassTemplSpec), ClassTemplSpec->getTemplateInstantiationArgs().asArray(), /*Final=*/false); // If this class template specialization was instantiated from a // specialized member that is a class template, we're done. assert(ClassTemplSpec->getSpecializedTemplate() && "No class template?"); if (ClassTemplSpec->getSpecializedTemplate()->isMemberSpecialization()) return Response::Done(); // If this was instantiated from a partial template specialization, we need // to get the next level of declaration context from the partial // specialization, as the ClassTemplateSpecializationDecl's // DeclContext/LexicalDeclContext will be for the primary template. if (auto *InstFromPartialTempl = ClassTemplSpec->getSpecializedTemplateOrPartial() .dyn_cast()) return Response::ChangeDecl(InstFromPartialTempl->getLexicalDeclContext()); } return Response::UseNextDecl(ClassTemplSpec); } Response HandleFunction(const FunctionDecl *Function, MultiLevelTemplateArgumentList &Result, const FunctionDecl *Pattern, bool RelativeToPrimary, bool ForConstraintInstantiation) { // Add template arguments from a function template specialization. 
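// For illustration (not part of this patch): checking a member's constraint
// can require arguments from several levels, e.g.
//
//   #include <concepts>
//   template <typename T> struct S {
//     template <typename U>
//     void f(U) requires std::constructible_from<T, U> {}
//   };
//   void use() { S<int>{}.f(0.0); }
//
// Satisfying the requires-clause needs both {U -> double} from f's
// specialization and {T -> int} from the enclosing S<int>.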
if (!RelativeToPrimary && Function->getTemplateSpecializationKindForInstantiation() == TSK_ExplicitSpecialization) return Response::Done(); if (!RelativeToPrimary && Function->getTemplateSpecializationKind() == TSK_ExplicitSpecialization) { // This is an implicit instantiation of an explicit specialization. We // don't get any template arguments from this function but might get // some from an enclosing template. return Response::UseNextDecl(Function); } else if (const TemplateArgumentList *TemplateArgs = Function->getTemplateSpecializationArgs()) { // Add the template arguments for this specialization. Result.addOuterTemplateArguments(const_cast(Function), TemplateArgs->asArray(), /*Final=*/false); // If this function was instantiated from a specialized member that is // a function template, we're done. assert(Function->getPrimaryTemplate() && "No function template?"); if (Function->getPrimaryTemplate()->isMemberSpecialization()) return Response::Done(); // If this function is a generic lambda specialization, we are done. if (!ForConstraintInstantiation && isGenericLambdaCallOperatorOrStaticInvokerSpecialization(Function)) return Response::Done(); } else if (Function->getDescribedFunctionTemplate()) { assert( (ForConstraintInstantiation || Result.getNumSubstitutedLevels() == 0) && "Outer template not instantiated?"); } // If this is a friend or local declaration and it declares an entity at // namespace scope, take arguments from its lexical parent // instead of its semantic parent, unless of course the pattern we're // instantiating actually comes from the file's context! if ((Function->getFriendObjectKind() || Function->isLocalExternDecl()) && Function->getNonTransparentDeclContext()->isFileContext() && (!Pattern || !Pattern->getLexicalDeclContext()->isFileContext())) { return Response::ChangeDecl(Function->getLexicalDeclContext()); } return Response::UseNextDecl(Function); } Response HandleFunctionTemplateDecl(const FunctionTemplateDecl *FTD, MultiLevelTemplateArgumentList &Result) { if (!isa(FTD->getDeclContext())) { NestedNameSpecifier *NNS = FTD->getTemplatedDecl()->getQualifier(); const Type *Ty; const TemplateSpecializationType *TSTy; if (NNS && (Ty = NNS->getAsType()) && (TSTy = Ty->getAs())) Result.addOuterTemplateArguments(const_cast(FTD), TSTy->template_arguments(), /*Final=*/false); } return Response::ChangeDecl(FTD->getLexicalDeclContext()); } Response HandleRecordDecl(const CXXRecordDecl *Rec, MultiLevelTemplateArgumentList &Result, ASTContext &Context, bool ForConstraintInstantiation) { if (ClassTemplateDecl *ClassTemplate = Rec->getDescribedClassTemplate()) { assert( (ForConstraintInstantiation || Result.getNumSubstitutedLevels() == 0) && "Outer template not instantiated?"); if (ClassTemplate->isMemberSpecialization()) return Response::Done(); if (ForConstraintInstantiation) Result.addOuterTemplateArguments(const_cast(Rec), ClassTemplate->getInjectedTemplateArgs(), /*Final=*/false); } if (const MemberSpecializationInfo *MSInfo = Rec->getMemberSpecializationInfo()) if (MSInfo->getTemplateSpecializationKind() == TSK_ExplicitSpecialization) return Response::Done(); bool IsFriend = Rec->getFriendObjectKind() || (Rec->getDescribedClassTemplate() && Rec->getDescribedClassTemplate()->getFriendObjectKind()); if (ForConstraintInstantiation && IsFriend && Rec->getNonTransparentDeclContext()->isFileContext()) { return Response::ChangeDecl(Rec->getLexicalDeclContext()); } // This is to make sure we pick up the VarTemplateSpecializationDecl that this // lambda is defined inside of. 
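// Illustrative sketch (not part of this patch; names are invented):
//
//   template <typename T>
//   auto MakeAdder = [](T base) { return [base](T x) { return base + x; }; };
//
//   auto AddThree = MakeAdder<int>(3);
//
// The closure type of MakeAdder<int>'s initializer has the
// VarTemplateSpecializationDecl MakeAdder<int> as its lambda context decl;
// when the walk starts from that closure type, following the context decl is
// what recovers {T -> int}.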
if (Rec->isLambda()) if (const Decl *LCD = Rec->getLambdaContextDecl()) return Response::ChangeDecl(LCD); return Response::UseNextDecl(Rec); } Response HandleImplicitConceptSpecializationDecl( const ImplicitConceptSpecializationDecl *CSD, MultiLevelTemplateArgumentList &Result) { Result.addOuterTemplateArguments( const_cast(CSD), CSD->getTemplateArguments(), /*Final=*/false); return Response::UseNextDecl(CSD); } Response HandleGenericDeclContext(const Decl *CurDecl) { return Response::UseNextDecl(CurDecl); } } // namespace TemplateInstArgsHelpers } // namespace /// Retrieve the template argument list(s) that should be used to /// instantiate the definition of the given declaration. /// /// \param ND the declaration for which we are computing template instantiation /// arguments. /// /// \param Innermost if non-NULL, specifies a template argument list for the /// template declaration passed as ND. /// /// \param RelativeToPrimary true if we should get the template /// arguments relative to the primary template, even when we're /// dealing with a specialization. This is only relevant for function /// template specializations. /// /// \param Pattern If non-NULL, indicates the pattern from which we will be /// instantiating the definition of the given declaration, \p ND. This is /// used to determine the proper set of template instantiation arguments for /// friend function template specializations. /// /// \param ForConstraintInstantiation when collecting arguments, /// ForConstraintInstantiation indicates we should continue looking when /// encountering a lambda generic call operator, and continue looking for /// arguments on an enclosing class template. MultiLevelTemplateArgumentList Sema::getTemplateInstantiationArgs( const NamedDecl *ND, bool Final, const TemplateArgumentList *Innermost, bool RelativeToPrimary, const FunctionDecl *Pattern, bool ForConstraintInstantiation, bool SkipForSpecialization) { assert(ND && "Can't find arguments for a decl if one isn't provided"); // Accumulate the set of template argument lists in this structure. 
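// For illustration (not part of this patch): given
//
//   template <typename T> struct A {
//     template <typename U> struct B { void f(); };
//   };
//
// instantiating A<int>::B<bool>::f collects two levels below: the innermost
// {U -> bool} from B<bool>, then the outer {T -> int} from A<int>.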
MultiLevelTemplateArgumentList Result; using namespace TemplateInstArgsHelpers; const Decl *CurDecl = ND; if (Innermost) { Result.addOuterTemplateArguments(const_cast(ND), Innermost->asArray(), Final); CurDecl = Response::UseNextDecl(ND).NextDecl; } while (!CurDecl->isFileContextDecl()) { Response R; if (const auto *VarTemplSpec = dyn_cast(CurDecl)) { R = HandleVarTemplateSpec(VarTemplSpec, Result, SkipForSpecialization); } else if (const auto *PartialClassTemplSpec = dyn_cast(CurDecl)) { R = HandlePartialClassTemplateSpec(PartialClassTemplSpec, Result, SkipForSpecialization); } else if (const auto *ClassTemplSpec = dyn_cast(CurDecl)) { R = HandleClassTemplateSpec(ClassTemplSpec, Result, SkipForSpecialization); } else if (const auto *Function = dyn_cast(CurDecl)) { R = HandleFunction(Function, Result, Pattern, RelativeToPrimary, ForConstraintInstantiation); } else if (const auto *Rec = dyn_cast(CurDecl)) { R = HandleRecordDecl(Rec, Result, Context, ForConstraintInstantiation); } else if (const auto *CSD = dyn_cast(CurDecl)) { R = HandleImplicitConceptSpecializationDecl(CSD, Result); } else if (const auto *FTD = dyn_cast(CurDecl)) { R = HandleFunctionTemplateDecl(FTD, Result); } else if (!isa(CurDecl)) { R = Response::DontClearRelativeToPrimaryNextDecl(CurDecl); if (CurDecl->getDeclContext()->isTranslationUnit()) { if (const auto *TTP = dyn_cast(CurDecl)) { R = HandleDefaultTempArgIntoTempTempParam(TTP, Result); } } } else { R = HandleGenericDeclContext(CurDecl); } if (R.IsDone) return Result; if (R.ClearRelativeToPrimary) RelativeToPrimary = false; assert(R.NextDecl); CurDecl = R.NextDecl; } return Result; } bool Sema::CodeSynthesisContext::isInstantiationRecord() const { switch (Kind) { case TemplateInstantiation: case ExceptionSpecInstantiation: case DefaultTemplateArgumentInstantiation: case DefaultFunctionArgumentInstantiation: case ExplicitTemplateArgumentSubstitution: case DeducedTemplateArgumentSubstitution: case PriorTemplateArgumentSubstitution: case ConstraintsCheck: case NestedRequirementConstraintsCheck: return true; case RequirementInstantiation: case RequirementParameterInstantiation: case DefaultTemplateArgumentChecking: case DeclaringSpecialMember: case DeclaringImplicitEqualityComparison: case DefiningSynthesizedFunction: case ExceptionSpecEvaluation: case ConstraintSubstitution: case ParameterMappingSubstitution: case ConstraintNormalization: case RewritingOperatorAsSpaceship: case InitializingStructuredBinding: case MarkingClassDllexported: case BuildingBuiltinDumpStructCall: case LambdaExpressionSubstitution: case BuildingDeductionGuides: return false; // This function should never be called when Kind's value is Memoization. case Memoization: break; } llvm_unreachable("Invalid SynthesisKind!"); } Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, CodeSynthesisContext::SynthesisKind Kind, SourceLocation PointOfInstantiation, SourceRange InstantiationRange, Decl *Entity, NamedDecl *Template, ArrayRef TemplateArgs, sema::TemplateDeductionInfo *DeductionInfo) : SemaRef(SemaRef) { // Don't allow further instantiation if a fatal error and an uncompilable // error have occurred. Any diagnostics we might have raised will not be // visible, and we do not need to construct a correct AST. 
if (SemaRef.Diags.hasFatalErrorOccurred() && SemaRef.hasUncompilableErrorOccurred()) { Invalid = true; return; } Invalid = CheckInstantiationDepth(PointOfInstantiation, InstantiationRange); if (!Invalid) { CodeSynthesisContext Inst; Inst.Kind = Kind; Inst.PointOfInstantiation = PointOfInstantiation; Inst.Entity = Entity; Inst.Template = Template; Inst.TemplateArgs = TemplateArgs.data(); Inst.NumTemplateArgs = TemplateArgs.size(); Inst.DeductionInfo = DeductionInfo; Inst.InstantiationRange = InstantiationRange; SemaRef.pushCodeSynthesisContext(Inst); AlreadyInstantiating = !Inst.Entity ? false : !SemaRef.InstantiatingSpecializations .insert({Inst.Entity->getCanonicalDecl(), Inst.Kind}) .second; atTemplateBegin(SemaRef.TemplateInstCallbacks, SemaRef, Inst); } } Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, Decl *Entity, SourceRange InstantiationRange) : InstantiatingTemplate(SemaRef, CodeSynthesisContext::TemplateInstantiation, PointOfInstantiation, InstantiationRange, Entity) {} Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, FunctionDecl *Entity, ExceptionSpecification, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::ExceptionSpecInstantiation, PointOfInstantiation, InstantiationRange, Entity) {} Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, TemplateParameter Param, TemplateDecl *Template, ArrayRef TemplateArgs, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::DefaultTemplateArgumentInstantiation, PointOfInstantiation, InstantiationRange, getAsNamedDecl(Param), Template, TemplateArgs) {} Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, FunctionTemplateDecl *FunctionTemplate, ArrayRef TemplateArgs, CodeSynthesisContext::SynthesisKind Kind, sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange) : InstantiatingTemplate(SemaRef, Kind, PointOfInstantiation, InstantiationRange, FunctionTemplate, nullptr, TemplateArgs, &DeductionInfo) { assert( Kind == CodeSynthesisContext::ExplicitTemplateArgumentSubstitution || Kind == CodeSynthesisContext::DeducedTemplateArgumentSubstitution); } Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, TemplateDecl *Template, ArrayRef TemplateArgs, sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::DeducedTemplateArgumentSubstitution, PointOfInstantiation, InstantiationRange, Template, nullptr, TemplateArgs, &DeductionInfo) {} Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, ClassTemplatePartialSpecializationDecl *PartialSpec, ArrayRef TemplateArgs, sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::DeducedTemplateArgumentSubstitution, PointOfInstantiation, InstantiationRange, PartialSpec, nullptr, TemplateArgs, &DeductionInfo) {} Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, VarTemplatePartialSpecializationDecl *PartialSpec, ArrayRef TemplateArgs, sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::DeducedTemplateArgumentSubstitution, 
PointOfInstantiation, InstantiationRange, PartialSpec, nullptr, TemplateArgs, &DeductionInfo) {} Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, ParmVarDecl *Param, ArrayRef TemplateArgs, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::DefaultFunctionArgumentInstantiation, PointOfInstantiation, InstantiationRange, Param, nullptr, TemplateArgs) {} Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, NamedDecl *Template, NonTypeTemplateParmDecl *Param, ArrayRef TemplateArgs, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::PriorTemplateArgumentSubstitution, PointOfInstantiation, InstantiationRange, Param, Template, TemplateArgs) {} Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, NamedDecl *Template, TemplateTemplateParmDecl *Param, ArrayRef TemplateArgs, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::PriorTemplateArgumentSubstitution, PointOfInstantiation, InstantiationRange, Param, Template, TemplateArgs) {} Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, TemplateDecl *Template, NamedDecl *Param, ArrayRef TemplateArgs, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::DefaultTemplateArgumentChecking, PointOfInstantiation, InstantiationRange, Param, Template, TemplateArgs) {} Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, concepts::Requirement *Req, sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::RequirementInstantiation, PointOfInstantiation, InstantiationRange, /*Entity=*/nullptr, /*Template=*/nullptr, /*TemplateArgs=*/std::nullopt, &DeductionInfo) { } Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, concepts::NestedRequirement *Req, ConstraintsCheck, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::NestedRequirementConstraintsCheck, PointOfInstantiation, InstantiationRange, /*Entity=*/nullptr, /*Template=*/nullptr, /*TemplateArgs=*/std::nullopt) {} Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, const RequiresExpr *RE, sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::RequirementParameterInstantiation, PointOfInstantiation, InstantiationRange, /*Entity=*/nullptr, /*Template=*/nullptr, /*TemplateArgs=*/std::nullopt, &DeductionInfo) { } Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, ConstraintsCheck, NamedDecl *Template, ArrayRef TemplateArgs, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::ConstraintsCheck, PointOfInstantiation, InstantiationRange, Template, nullptr, TemplateArgs) {} Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, ConstraintSubstitution, NamedDecl *Template, sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::ConstraintSubstitution, PointOfInstantiation, InstantiationRange, Template, nullptr, {}, 
&DeductionInfo) {} Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, ConstraintNormalization, NamedDecl *Template, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::ConstraintNormalization, PointOfInstantiation, InstantiationRange, Template) {} Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, ParameterMappingSubstitution, NamedDecl *Template, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::ParameterMappingSubstitution, PointOfInstantiation, InstantiationRange, Template) {} Sema::InstantiatingTemplate::InstantiatingTemplate( Sema &SemaRef, SourceLocation PointOfInstantiation, TemplateDecl *Entity, BuildingDeductionGuidesTag, SourceRange InstantiationRange) : InstantiatingTemplate( SemaRef, CodeSynthesisContext::BuildingDeductionGuides, PointOfInstantiation, InstantiationRange, Entity) {} void Sema::pushCodeSynthesisContext(CodeSynthesisContext Ctx) { Ctx.SavedInNonInstantiationSFINAEContext = InNonInstantiationSFINAEContext; InNonInstantiationSFINAEContext = false; CodeSynthesisContexts.push_back(Ctx); if (!Ctx.isInstantiationRecord()) ++NonInstantiationEntries; // Check to see if we're low on stack space. We can't do anything about this // from here, but we can at least warn the user. if (isStackNearlyExhausted()) warnStackExhausted(Ctx.PointOfInstantiation); } void Sema::popCodeSynthesisContext() { auto &Active = CodeSynthesisContexts.back(); if (!Active.isInstantiationRecord()) { assert(NonInstantiationEntries > 0); --NonInstantiationEntries; } InNonInstantiationSFINAEContext = Active.SavedInNonInstantiationSFINAEContext; // Name lookup no longer looks in this template's defining module. assert(CodeSynthesisContexts.size() >= CodeSynthesisContextLookupModules.size() && "forgot to remove a lookup module for a template instantiation"); if (CodeSynthesisContexts.size() == CodeSynthesisContextLookupModules.size()) { if (Module *M = CodeSynthesisContextLookupModules.back()) LookupModulesCache.erase(M); CodeSynthesisContextLookupModules.pop_back(); } // If we've left the code synthesis context for the current context stack, // stop remembering that we've emitted that stack. 
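// For illustration (not part of this patch; the example is invented): the
// context stack maintained here is what drives the familiar backtrace, e.g.
//
//   template <int N> struct Fact {
//     static constexpr int value = N * Fact<N - 1>::value;
//   };
//   constexpr int x = Fact<2000>::value;
//
// exceeds the instantiation depth limit and then prints one
// "in instantiation of ... requested here" note per stack entry, subject to
// -ftemplate-backtrace-limit.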
if (CodeSynthesisContexts.size() == LastEmittedCodeSynthesisContextDepth) LastEmittedCodeSynthesisContextDepth = 0; CodeSynthesisContexts.pop_back(); } void Sema::InstantiatingTemplate::Clear() { if (!Invalid) { if (!AlreadyInstantiating) { auto &Active = SemaRef.CodeSynthesisContexts.back(); if (Active.Entity) SemaRef.InstantiatingSpecializations.erase( {Active.Entity->getCanonicalDecl(), Active.Kind}); } atTemplateEnd(SemaRef.TemplateInstCallbacks, SemaRef, SemaRef.CodeSynthesisContexts.back()); SemaRef.popCodeSynthesisContext(); Invalid = true; } } static std::string convertCallArgsToString(Sema &S, llvm::ArrayRef Args) { std::string Result; llvm::raw_string_ostream OS(Result); llvm::ListSeparator Comma; for (const Expr *Arg : Args) { OS << Comma; Arg->IgnoreParens()->printPretty(OS, nullptr, S.Context.getPrintingPolicy()); } return Result; } bool Sema::InstantiatingTemplate::CheckInstantiationDepth( SourceLocation PointOfInstantiation, SourceRange InstantiationRange) { assert(SemaRef.NonInstantiationEntries <= SemaRef.CodeSynthesisContexts.size()); if ((SemaRef.CodeSynthesisContexts.size() - SemaRef.NonInstantiationEntries) <= SemaRef.getLangOpts().InstantiationDepth) return false; SemaRef.Diag(PointOfInstantiation, diag::err_template_recursion_depth_exceeded) << SemaRef.getLangOpts().InstantiationDepth << InstantiationRange; SemaRef.Diag(PointOfInstantiation, diag::note_template_recursion_depth) << SemaRef.getLangOpts().InstantiationDepth; return true; } /// Prints the current instantiation stack through a series of /// notes. void Sema::PrintInstantiationStack() { // Determine which template instantiations to skip, if any. unsigned SkipStart = CodeSynthesisContexts.size(), SkipEnd = SkipStart; unsigned Limit = Diags.getTemplateBacktraceLimit(); if (Limit && Limit < CodeSynthesisContexts.size()) { SkipStart = Limit / 2 + Limit % 2; SkipEnd = CodeSynthesisContexts.size() - Limit / 2; } // FIXME: In all of these cases, we need to show the template arguments unsigned InstantiationIdx = 0; for (SmallVectorImpl::reverse_iterator Active = CodeSynthesisContexts.rbegin(), ActiveEnd = CodeSynthesisContexts.rend(); Active != ActiveEnd; ++Active, ++InstantiationIdx) { // Skip this instantiation? if (InstantiationIdx >= SkipStart && InstantiationIdx < SkipEnd) { if (InstantiationIdx == SkipStart) { // Note that we're skipping instantiations. Diags.Report(Active->PointOfInstantiation, diag::note_instantiation_contexts_suppressed) << unsigned(CodeSynthesisContexts.size() - Limit); } continue; } switch (Active->Kind) { case CodeSynthesisContext::TemplateInstantiation: { Decl *D = Active->Entity; if (CXXRecordDecl *Record = dyn_cast(D)) { unsigned DiagID = diag::note_template_member_class_here; if (isa(Record)) DiagID = diag::note_template_class_instantiation_here; Diags.Report(Active->PointOfInstantiation, DiagID) << Record << Active->InstantiationRange; } else if (FunctionDecl *Function = dyn_cast(D)) { unsigned DiagID; if (Function->getPrimaryTemplate()) DiagID = diag::note_function_template_spec_here; else DiagID = diag::note_template_member_function_here; Diags.Report(Active->PointOfInstantiation, DiagID) << Function << Active->InstantiationRange; } else if (VarDecl *VD = dyn_cast(D)) { Diags.Report(Active->PointOfInstantiation, VD->isStaticDataMember()? 
diag::note_template_static_data_member_def_here : diag::note_template_variable_def_here) << VD << Active->InstantiationRange; } else if (EnumDecl *ED = dyn_cast(D)) { Diags.Report(Active->PointOfInstantiation, diag::note_template_enum_def_here) << ED << Active->InstantiationRange; } else if (FieldDecl *FD = dyn_cast(D)) { Diags.Report(Active->PointOfInstantiation, diag::note_template_nsdmi_here) << FD << Active->InstantiationRange; } else { Diags.Report(Active->PointOfInstantiation, diag::note_template_type_alias_instantiation_here) << cast(D) << Active->InstantiationRange; } break; } case CodeSynthesisContext::DefaultTemplateArgumentInstantiation: { TemplateDecl *Template = cast(Active->Template); SmallString<128> TemplateArgsStr; llvm::raw_svector_ostream OS(TemplateArgsStr); Template->printName(OS, getPrintingPolicy()); printTemplateArgumentList(OS, Active->template_arguments(), getPrintingPolicy()); Diags.Report(Active->PointOfInstantiation, diag::note_default_arg_instantiation_here) << OS.str() << Active->InstantiationRange; break; } case CodeSynthesisContext::ExplicitTemplateArgumentSubstitution: { FunctionTemplateDecl *FnTmpl = cast(Active->Entity); Diags.Report(Active->PointOfInstantiation, diag::note_explicit_template_arg_substitution_here) << FnTmpl << getTemplateArgumentBindingsText(FnTmpl->getTemplateParameters(), Active->TemplateArgs, Active->NumTemplateArgs) << Active->InstantiationRange; break; } case CodeSynthesisContext::DeducedTemplateArgumentSubstitution: { if (FunctionTemplateDecl *FnTmpl = dyn_cast(Active->Entity)) { Diags.Report(Active->PointOfInstantiation, diag::note_function_template_deduction_instantiation_here) << FnTmpl << getTemplateArgumentBindingsText(FnTmpl->getTemplateParameters(), Active->TemplateArgs, Active->NumTemplateArgs) << Active->InstantiationRange; } else { bool IsVar = isa(Active->Entity) || isa(Active->Entity); bool IsTemplate = false; TemplateParameterList *Params; if (auto *D = dyn_cast(Active->Entity)) { IsTemplate = true; Params = D->getTemplateParameters(); } else if (auto *D = dyn_cast( Active->Entity)) { Params = D->getTemplateParameters(); } else if (auto *D = dyn_cast( Active->Entity)) { Params = D->getTemplateParameters(); } else { llvm_unreachable("unexpected template kind"); } Diags.Report(Active->PointOfInstantiation, diag::note_deduced_template_arg_substitution_here) << IsVar << IsTemplate << cast(Active->Entity) << getTemplateArgumentBindingsText(Params, Active->TemplateArgs, Active->NumTemplateArgs) << Active->InstantiationRange; } break; } case CodeSynthesisContext::DefaultFunctionArgumentInstantiation: { ParmVarDecl *Param = cast(Active->Entity); FunctionDecl *FD = cast(Param->getDeclContext()); SmallString<128> TemplateArgsStr; llvm::raw_svector_ostream OS(TemplateArgsStr); FD->printName(OS, getPrintingPolicy()); printTemplateArgumentList(OS, Active->template_arguments(), getPrintingPolicy()); Diags.Report(Active->PointOfInstantiation, diag::note_default_function_arg_instantiation_here) << OS.str() << Active->InstantiationRange; break; } case CodeSynthesisContext::PriorTemplateArgumentSubstitution: { NamedDecl *Parm = cast(Active->Entity); std::string Name; if (!Parm->getName().empty()) Name = std::string(" '") + Parm->getName().str() + "'"; TemplateParameterList *TemplateParams = nullptr; if (TemplateDecl *Template = dyn_cast(Active->Template)) TemplateParams = Template->getTemplateParameters(); else TemplateParams = cast(Active->Template) ->getTemplateParameters(); Diags.Report(Active->PointOfInstantiation, 
diag::note_prior_template_arg_substitution) << isa(Parm) << Name << getTemplateArgumentBindingsText(TemplateParams, Active->TemplateArgs, Active->NumTemplateArgs) << Active->InstantiationRange; break; } case CodeSynthesisContext::DefaultTemplateArgumentChecking: { TemplateParameterList *TemplateParams = nullptr; if (TemplateDecl *Template = dyn_cast(Active->Template)) TemplateParams = Template->getTemplateParameters(); else TemplateParams = cast(Active->Template) ->getTemplateParameters(); Diags.Report(Active->PointOfInstantiation, diag::note_template_default_arg_checking) << getTemplateArgumentBindingsText(TemplateParams, Active->TemplateArgs, Active->NumTemplateArgs) << Active->InstantiationRange; break; } case CodeSynthesisContext::ExceptionSpecEvaluation: Diags.Report(Active->PointOfInstantiation, diag::note_evaluating_exception_spec_here) << cast(Active->Entity); break; case CodeSynthesisContext::ExceptionSpecInstantiation: Diags.Report(Active->PointOfInstantiation, diag::note_template_exception_spec_instantiation_here) << cast(Active->Entity) << Active->InstantiationRange; break; case CodeSynthesisContext::RequirementInstantiation: Diags.Report(Active->PointOfInstantiation, diag::note_template_requirement_instantiation_here) << Active->InstantiationRange; break; case CodeSynthesisContext::RequirementParameterInstantiation: Diags.Report(Active->PointOfInstantiation, diag::note_template_requirement_params_instantiation_here) << Active->InstantiationRange; break; case CodeSynthesisContext::NestedRequirementConstraintsCheck: Diags.Report(Active->PointOfInstantiation, diag::note_nested_requirement_here) << Active->InstantiationRange; break; case CodeSynthesisContext::DeclaringSpecialMember: Diags.Report(Active->PointOfInstantiation, diag::note_in_declaration_of_implicit_special_member) << cast(Active->Entity) << Active->SpecialMember; break; case CodeSynthesisContext::DeclaringImplicitEqualityComparison: Diags.Report(Active->Entity->getLocation(), diag::note_in_declaration_of_implicit_equality_comparison); break; case CodeSynthesisContext::DefiningSynthesizedFunction: { // FIXME: For synthesized functions that are not defaulted, // produce a note. auto *FD = dyn_cast(Active->Entity); DefaultedFunctionKind DFK = FD ? 
getDefaultedFunctionKind(FD) : DefaultedFunctionKind(); if (DFK.isSpecialMember()) { auto *MD = cast(FD); Diags.Report(Active->PointOfInstantiation, diag::note_member_synthesized_at) << MD->isExplicitlyDefaulted() << DFK.asSpecialMember() << Context.getTagDeclType(MD->getParent()); } else if (DFK.isComparison()) { QualType RecordType = FD->getParamDecl(0) ->getType() .getNonReferenceType() .getUnqualifiedType(); Diags.Report(Active->PointOfInstantiation, diag::note_comparison_synthesized_at) << (int)DFK.asComparison() << RecordType; } break; } case CodeSynthesisContext::RewritingOperatorAsSpaceship: Diags.Report(Active->Entity->getLocation(), diag::note_rewriting_operator_as_spaceship); break; case CodeSynthesisContext::InitializingStructuredBinding: Diags.Report(Active->PointOfInstantiation, diag::note_in_binding_decl_init) << cast(Active->Entity); break; case CodeSynthesisContext::MarkingClassDllexported: Diags.Report(Active->PointOfInstantiation, diag::note_due_to_dllexported_class) << cast(Active->Entity) << !getLangOpts().CPlusPlus11; break; case CodeSynthesisContext::BuildingBuiltinDumpStructCall: Diags.Report(Active->PointOfInstantiation, diag::note_building_builtin_dump_struct_call) << convertCallArgsToString( *this, llvm::ArrayRef(Active->CallArgs, Active->NumCallArgs)); break; case CodeSynthesisContext::Memoization: break; case CodeSynthesisContext::LambdaExpressionSubstitution: Diags.Report(Active->PointOfInstantiation, diag::note_lambda_substitution_here); break; case CodeSynthesisContext::ConstraintsCheck: { unsigned DiagID = 0; if (!Active->Entity) { Diags.Report(Active->PointOfInstantiation, diag::note_nested_requirement_here) << Active->InstantiationRange; break; } if (isa(Active->Entity)) DiagID = diag::note_concept_specialization_here; else if (isa(Active->Entity)) DiagID = diag::note_checking_constraints_for_template_id_here; else if (isa(Active->Entity)) DiagID = diag::note_checking_constraints_for_var_spec_id_here; else if (isa(Active->Entity)) DiagID = diag::note_checking_constraints_for_class_spec_id_here; else { assert(isa(Active->Entity)); DiagID = diag::note_checking_constraints_for_function_here; } SmallString<128> TemplateArgsStr; llvm::raw_svector_ostream OS(TemplateArgsStr); cast(Active->Entity)->printName(OS, getPrintingPolicy()); if (!isa(Active->Entity)) { printTemplateArgumentList(OS, Active->template_arguments(), getPrintingPolicy()); } Diags.Report(Active->PointOfInstantiation, DiagID) << OS.str() << Active->InstantiationRange; break; } case CodeSynthesisContext::ConstraintSubstitution: Diags.Report(Active->PointOfInstantiation, diag::note_constraint_substitution_here) << Active->InstantiationRange; break; case CodeSynthesisContext::ConstraintNormalization: Diags.Report(Active->PointOfInstantiation, diag::note_constraint_normalization_here) << cast(Active->Entity)->getName() << Active->InstantiationRange; break; case CodeSynthesisContext::ParameterMappingSubstitution: Diags.Report(Active->PointOfInstantiation, diag::note_parameter_mapping_substitution_here) << Active->InstantiationRange; break; case CodeSynthesisContext::BuildingDeductionGuides: llvm_unreachable("unexpected deduction guide in instantiation stack"); } } } std::optional Sema::isSFINAEContext() const { if (InNonInstantiationSFINAEContext) return std::optional(nullptr); bool SawLambdaSubstitution = false; for (SmallVectorImpl::const_reverse_iterator Active = CodeSynthesisContexts.rbegin(), ActiveEnd = CodeSynthesisContexts.rend(); Active != ActiveEnd; ++Active) { switch (Active->Kind) { case 
CodeSynthesisContext::TemplateInstantiation: // An instantiation of an alias template may or may not be a SFINAE // context, depending on what else is on the stack. if (isa(Active->Entity)) break; [[fallthrough]]; case CodeSynthesisContext::DefaultFunctionArgumentInstantiation: case CodeSynthesisContext::ExceptionSpecInstantiation: case CodeSynthesisContext::ConstraintsCheck: case CodeSynthesisContext::ParameterMappingSubstitution: case CodeSynthesisContext::ConstraintNormalization: case CodeSynthesisContext::NestedRequirementConstraintsCheck: // This is a template instantiation, so there is no SFINAE. return std::nullopt; case CodeSynthesisContext::LambdaExpressionSubstitution: // [temp.deduct]p9 // A lambda-expression appearing in a function type or a template // parameter is not considered part of the immediate context for the // purposes of template argument deduction. // We need to check parents. SawLambdaSubstitution = true; break; case CodeSynthesisContext::DefaultTemplateArgumentInstantiation: case CodeSynthesisContext::PriorTemplateArgumentSubstitution: case CodeSynthesisContext::DefaultTemplateArgumentChecking: case CodeSynthesisContext::RewritingOperatorAsSpaceship: // A default template argument instantiation and substitution into // template parameters with arguments for prior parameters may or may // not be a SFINAE context; look further up the stack. break; case CodeSynthesisContext::ExplicitTemplateArgumentSubstitution: case CodeSynthesisContext::DeducedTemplateArgumentSubstitution: // We're either substituting explicitly-specified template arguments, // deduced template arguments. SFINAE applies unless we are in a lambda // expression, see [temp.deduct]p9. if (SawLambdaSubstitution) return std::nullopt; [[fallthrough]]; case CodeSynthesisContext::ConstraintSubstitution: case CodeSynthesisContext::RequirementInstantiation: case CodeSynthesisContext::RequirementParameterInstantiation: // SFINAE always applies in a constraint expression or a requirement // in a requires expression. assert(Active->DeductionInfo && "Missing deduction info pointer"); return Active->DeductionInfo; case CodeSynthesisContext::DeclaringSpecialMember: case CodeSynthesisContext::DeclaringImplicitEqualityComparison: case CodeSynthesisContext::DefiningSynthesizedFunction: case CodeSynthesisContext::InitializingStructuredBinding: case CodeSynthesisContext::MarkingClassDllexported: case CodeSynthesisContext::BuildingBuiltinDumpStructCall: case CodeSynthesisContext::BuildingDeductionGuides: // This happens in a context unrelated to template instantiation, so // there is no SFINAE. return std::nullopt; case CodeSynthesisContext::ExceptionSpecEvaluation: // FIXME: This should not be treated as a SFINAE context, because // we will cache an incorrect exception specification. However, clang // bootstrap relies this! See PR31692. break; case CodeSynthesisContext::Memoization: break; } // The inner context was transparent for SFINAE. If it occurred within a // non-instantiation SFINAE context, then SFINAE applies. 
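// A rough sketch of the lambda rule cited above (illustrative only, not part
// of this patch):
//
//   template <typename T>
//   auto f(T t) -> decltype([](auto x) { return *x; }(t), void()) {}
//   void f(...) {}
//
//   void g() { f(42); }
//
// The failing '*x' occurs inside the lambda during deduction; per
// [temp.deduct]p9 that is not part of the immediate context, so the result is
// a hard error rather than a quiet fallback to f(...).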
if (Active->SavedInNonInstantiationSFINAEContext) return std::optional(nullptr); } return std::nullopt; } //===----------------------------------------------------------------------===/ // Template Instantiation for Types //===----------------------------------------------------------------------===/ namespace { class TemplateInstantiator : public TreeTransform { const MultiLevelTemplateArgumentList &TemplateArgs; SourceLocation Loc; DeclarationName Entity; bool EvaluateConstraints = true; public: typedef TreeTransform inherited; TemplateInstantiator(Sema &SemaRef, const MultiLevelTemplateArgumentList &TemplateArgs, SourceLocation Loc, DeclarationName Entity) : inherited(SemaRef), TemplateArgs(TemplateArgs), Loc(Loc), Entity(Entity) {} void setEvaluateConstraints(bool B) { EvaluateConstraints = B; } bool getEvaluateConstraints() { return EvaluateConstraints; } /// Determine whether the given type \p T has already been /// transformed. /// /// For the purposes of template instantiation, a type has already been /// transformed if it is NULL or if it is not dependent. bool AlreadyTransformed(QualType T); /// Returns the location of the entity being instantiated, if known. SourceLocation getBaseLocation() { return Loc; } /// Returns the name of the entity being instantiated, if any. DeclarationName getBaseEntity() { return Entity; } /// Sets the "base" location and entity when that /// information is known based on another transformation. void setBase(SourceLocation Loc, DeclarationName Entity) { this->Loc = Loc; this->Entity = Entity; } unsigned TransformTemplateDepth(unsigned Depth) { return TemplateArgs.getNewDepth(Depth); } std::optional getPackIndex(TemplateArgument Pack) { int Index = getSema().ArgumentPackSubstitutionIndex; if (Index == -1) return std::nullopt; return Pack.pack_size() - 1 - Index; } bool TryExpandParameterPacks(SourceLocation EllipsisLoc, SourceRange PatternRange, ArrayRef Unexpanded, bool &ShouldExpand, bool &RetainExpansion, std::optional &NumExpansions) { return getSema().CheckParameterPacksForExpansion(EllipsisLoc, PatternRange, Unexpanded, TemplateArgs, ShouldExpand, RetainExpansion, NumExpansions); } void ExpandingFunctionParameterPack(ParmVarDecl *Pack) { SemaRef.CurrentInstantiationScope->MakeInstantiatedLocalArgPack(Pack); } TemplateArgument ForgetPartiallySubstitutedPack() { TemplateArgument Result; if (NamedDecl *PartialPack = SemaRef.CurrentInstantiationScope->getPartiallySubstitutedPack()){ MultiLevelTemplateArgumentList &TemplateArgs = const_cast(this->TemplateArgs); unsigned Depth, Index; std::tie(Depth, Index) = getDepthAndIndex(PartialPack); if (TemplateArgs.hasTemplateArgument(Depth, Index)) { Result = TemplateArgs(Depth, Index); TemplateArgs.setArgument(Depth, Index, TemplateArgument()); } } return Result; } void RememberPartiallySubstitutedPack(TemplateArgument Arg) { if (Arg.isNull()) return; if (NamedDecl *PartialPack = SemaRef.CurrentInstantiationScope->getPartiallySubstitutedPack()){ MultiLevelTemplateArgumentList &TemplateArgs = const_cast(this->TemplateArgs); unsigned Depth, Index; std::tie(Depth, Index) = getDepthAndIndex(PartialPack); TemplateArgs.setArgument(Depth, Index, Arg); } } /// Transform the given declaration by instantiating a reference to /// this declaration. 
Decl *TransformDecl(SourceLocation Loc, Decl *D); void transformAttrs(Decl *Old, Decl *New) { SemaRef.InstantiateAttrs(TemplateArgs, Old, New); } void transformedLocalDecl(Decl *Old, ArrayRef NewDecls) { if (Old->isParameterPack()) { SemaRef.CurrentInstantiationScope->MakeInstantiatedLocalArgPack(Old); for (auto *New : NewDecls) SemaRef.CurrentInstantiationScope->InstantiatedLocalPackArg( Old, cast(New)); return; } assert(NewDecls.size() == 1 && "should only have multiple expansions for a pack"); Decl *New = NewDecls.front(); // If we've instantiated the call operator of a lambda or the call // operator template of a generic lambda, update the "instantiation of" // information. auto *NewMD = dyn_cast(New); if (NewMD && isLambdaCallOperator(NewMD)) { auto *OldMD = dyn_cast(Old); if (auto *NewTD = NewMD->getDescribedFunctionTemplate()) NewTD->setInstantiatedFromMemberTemplate( OldMD->getDescribedFunctionTemplate()); else NewMD->setInstantiationOfMemberFunction(OldMD, TSK_ImplicitInstantiation); } SemaRef.CurrentInstantiationScope->InstantiatedLocal(Old, New); // We recreated a local declaration, but not by instantiating it. There // may be pending dependent diagnostics to produce. if (auto *DC = dyn_cast(Old); DC && DC->isDependentContext() && DC->isFunctionOrMethod()) SemaRef.PerformDependentDiagnostics(DC, TemplateArgs); } /// Transform the definition of the given declaration by /// instantiating it. Decl *TransformDefinition(SourceLocation Loc, Decl *D); /// Transform the first qualifier within a scope by instantiating the /// declaration. NamedDecl *TransformFirstQualifierInScope(NamedDecl *D, SourceLocation Loc); /// Rebuild the exception declaration and register the declaration /// as an instantiated local. VarDecl *RebuildExceptionDecl(VarDecl *ExceptionDecl, TypeSourceInfo *Declarator, SourceLocation StartLoc, SourceLocation NameLoc, IdentifierInfo *Name); /// Rebuild the Objective-C exception declaration and register the /// declaration as an instantiated local. VarDecl *RebuildObjCExceptionDecl(VarDecl *ExceptionDecl, TypeSourceInfo *TSInfo, QualType T); /// Check for tag mismatches when instantiating an /// elaborated type. QualType RebuildElaboratedType(SourceLocation KeywordLoc, ElaboratedTypeKeyword Keyword, NestedNameSpecifierLoc QualifierLoc, QualType T); TemplateName TransformTemplateName(CXXScopeSpec &SS, TemplateName Name, SourceLocation NameLoc, QualType ObjectType = QualType(), NamedDecl *FirstQualifierInScope = nullptr, bool AllowInjectedClassName = false); const LoopHintAttr *TransformLoopHintAttr(const LoopHintAttr *LH); const NoInlineAttr *TransformStmtNoInlineAttr(const Stmt *OrigS, const Stmt *InstS, const NoInlineAttr *A); const AlwaysInlineAttr * TransformStmtAlwaysInlineAttr(const Stmt *OrigS, const Stmt *InstS, const AlwaysInlineAttr *A); ExprResult TransformPredefinedExpr(PredefinedExpr *E); ExprResult TransformDeclRefExpr(DeclRefExpr *E); ExprResult TransformCXXDefaultArgExpr(CXXDefaultArgExpr *E); ExprResult TransformTemplateParmRefExpr(DeclRefExpr *E, NonTypeTemplateParmDecl *D); ExprResult TransformSubstNonTypeTemplateParmPackExpr( SubstNonTypeTemplateParmPackExpr *E); ExprResult TransformSubstNonTypeTemplateParmExpr( SubstNonTypeTemplateParmExpr *E); /// Rebuild a DeclRefExpr for a VarDecl reference. ExprResult RebuildVarDeclRefExpr(VarDecl *PD, SourceLocation Loc); /// Transform a reference to a function or init-capture parameter pack. 
ExprResult TransformFunctionParmPackRefExpr(DeclRefExpr *E, VarDecl *PD); /// Transform a FunctionParmPackExpr which was built when we couldn't /// expand a function parameter pack reference which refers to an expanded /// pack. ExprResult TransformFunctionParmPackExpr(FunctionParmPackExpr *E); QualType TransformFunctionProtoType(TypeLocBuilder &TLB, FunctionProtoTypeLoc TL) { // Call the base version; it will forward to our overridden version below. return inherited::TransformFunctionProtoType(TLB, TL); } template QualType TransformFunctionProtoType(TypeLocBuilder &TLB, FunctionProtoTypeLoc TL, CXXRecordDecl *ThisContext, Qualifiers ThisTypeQuals, Fn TransformExceptionSpec); ParmVarDecl * TransformFunctionTypeParam(ParmVarDecl *OldParm, int indexAdjustment, std::optional NumExpansions, bool ExpectParameterPack); using inherited::TransformTemplateTypeParmType; /// Transforms a template type parameter type by performing /// substitution of the corresponding template type argument. QualType TransformTemplateTypeParmType(TypeLocBuilder &TLB, TemplateTypeParmTypeLoc TL, bool SuppressObjCLifetime); QualType BuildSubstTemplateTypeParmType( TypeLocBuilder &TLB, bool SuppressObjCLifetime, bool Final, Decl *AssociatedDecl, unsigned Index, std::optional PackIndex, TemplateArgument Arg, SourceLocation NameLoc); /// Transforms an already-substituted template type parameter pack /// into either itself (if we aren't substituting into its pack expansion) /// or the appropriate substituted argument. using inherited::TransformSubstTemplateTypeParmPackType; QualType TransformSubstTemplateTypeParmPackType(TypeLocBuilder &TLB, SubstTemplateTypeParmPackTypeLoc TL, bool SuppressObjCLifetime); ExprResult TransformLambdaExpr(LambdaExpr *E) { LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true); Sema::ConstraintEvalRAII RAII(*this); Sema::CodeSynthesisContext C; C.Kind = clang::Sema::CodeSynthesisContext::LambdaExpressionSubstitution; C.PointOfInstantiation = E->getBeginLoc(); SemaRef.pushCodeSynthesisContext(C); auto PopCtx = llvm::make_scope_exit([this] { SemaRef.popCodeSynthesisContext(); }); ExprResult Result = inherited::TransformLambdaExpr(E); if (Result.isInvalid()) return Result; CXXMethodDecl *MD = Result.getAs()->getCallOperator(); for (ParmVarDecl *PVD : MD->parameters()) { assert(PVD && "null in a parameter list"); if (!PVD->hasDefaultArg()) continue; Expr *UninstExpr = PVD->getUninstantiatedDefaultArg(); // FIXME: Obtain the source location for the '=' token. SourceLocation EqualLoc = UninstExpr->getBeginLoc(); if (SemaRef.SubstDefaultArgument(EqualLoc, PVD, TemplateArgs)) { // If substitution fails, the default argument is set to a // RecoveryExpr that wraps the uninstantiated default argument so // that downstream diagnostics are omitted. ExprResult ErrorResult = SemaRef.CreateRecoveryExpr( UninstExpr->getBeginLoc(), UninstExpr->getEndLoc(), { UninstExpr }, UninstExpr->getType()); if (ErrorResult.isUsable()) PVD->setDefaultArg(ErrorResult.get()); } } return Result; } ExprResult TransformRequiresExpr(RequiresExpr *E) { LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true); ExprResult TransReq = inherited::TransformRequiresExpr(E); if (TransReq.isInvalid()) return TransReq; assert(TransReq.get() != E && "Do not change value of isSatisfied for the existing expression. " "Create a new expression instead."); if (E->getBody()->isDependentContext()) { Sema::SFINAETrap Trap(SemaRef); // We recreate the RequiresExpr body, but not by instantiating it. 
// Produce pending diagnostics for dependent access check. SemaRef.PerformDependentDiagnostics(E->getBody(), TemplateArgs); // FIXME: Store SFINAE diagnostics in RequiresExpr for diagnosis. if (Trap.hasErrorOccurred()) TransReq.getAs()->setSatisfied(false); } return TransReq; } bool TransformRequiresExprRequirements( ArrayRef Reqs, SmallVectorImpl &Transformed) { bool SatisfactionDetermined = false; for (concepts::Requirement *Req : Reqs) { concepts::Requirement *TransReq = nullptr; if (!SatisfactionDetermined) { if (auto *TypeReq = dyn_cast(Req)) TransReq = TransformTypeRequirement(TypeReq); else if (auto *ExprReq = dyn_cast(Req)) TransReq = TransformExprRequirement(ExprReq); else TransReq = TransformNestedRequirement( cast(Req)); if (!TransReq) return true; if (!TransReq->isDependent() && !TransReq->isSatisfied()) // [expr.prim.req]p6 // [...] The substitution and semantic constraint checking // proceeds in lexical order and stops when a condition that // determines the result of the requires-expression is // encountered. [..] SatisfactionDetermined = true; } else TransReq = Req; Transformed.push_back(TransReq); } return false; } TemplateParameterList *TransformTemplateParameterList( TemplateParameterList *OrigTPL) { if (!OrigTPL || !OrigTPL->size()) return OrigTPL; DeclContext *Owner = OrigTPL->getParam(0)->getDeclContext(); TemplateDeclInstantiator DeclInstantiator(getSema(), /* DeclContext *Owner */ Owner, TemplateArgs); DeclInstantiator.setEvaluateConstraints(EvaluateConstraints); return DeclInstantiator.SubstTemplateParams(OrigTPL); } concepts::TypeRequirement * TransformTypeRequirement(concepts::TypeRequirement *Req); concepts::ExprRequirement * TransformExprRequirement(concepts::ExprRequirement *Req); concepts::NestedRequirement * TransformNestedRequirement(concepts::NestedRequirement *Req); ExprResult TransformRequiresTypeParams( SourceLocation KWLoc, SourceLocation RBraceLoc, const RequiresExpr *RE, RequiresExprBodyDecl *Body, ArrayRef Params, SmallVectorImpl &PTypes, SmallVectorImpl &TransParams, Sema::ExtParameterInfoBuilder &PInfos); private: ExprResult transformNonTypeTemplateParmRef(Decl *AssociatedDecl, const NonTypeTemplateParmDecl *parm, SourceLocation loc, TemplateArgument arg, std::optional PackIndex); }; } bool TemplateInstantiator::AlreadyTransformed(QualType T) { if (T.isNull()) return true; if (T->isInstantiationDependentType() || T->isVariablyModifiedType()) return false; getSema().MarkDeclarationsReferencedInType(Loc, T); return true; } static TemplateArgument getPackSubstitutedTemplateArgument(Sema &S, TemplateArgument Arg) { assert(S.ArgumentPackSubstitutionIndex >= 0); assert(S.ArgumentPackSubstitutionIndex < (int)Arg.pack_size()); Arg = Arg.pack_begin()[S.ArgumentPackSubstitutionIndex]; if (Arg.isPackExpansion()) Arg = Arg.getPackExpansionPattern(); return Arg; } Decl *TemplateInstantiator::TransformDecl(SourceLocation Loc, Decl *D) { if (!D) return nullptr; if (TemplateTemplateParmDecl *TTP = dyn_cast(D)) { if (TTP->getDepth() < TemplateArgs.getNumLevels()) { // If the corresponding template argument is NULL or non-existent, it's // because we are performing instantiation from explicitly-specified // template arguments in a function template, but there were some // arguments left unspecified. 
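      // For illustration, a minimal sketch of the replacement performed
      // below: a use of a template template parameter is rewritten to the
      // template named by the corresponding template argument.
      //
      //   template <template <class> class TT> struct A { TT<int> Member; };
      //   template <class> struct Vec {};
      //   A<Vec> a;  // within the instantiation of A<Vec>, 'TT' denotes 'Vec'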
if (!TemplateArgs.hasTemplateArgument(TTP->getDepth(), TTP->getPosition())) return D; TemplateArgument Arg = TemplateArgs(TTP->getDepth(), TTP->getPosition()); if (TTP->isParameterPack()) { assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); } TemplateName Template = Arg.getAsTemplate().getNameToSubstitute(); assert(!Template.isNull() && Template.getAsTemplateDecl() && "Wrong kind of template template argument"); return Template.getAsTemplateDecl(); } // Fall through to find the instantiated declaration for this template // template parameter. } return SemaRef.FindInstantiatedDecl(Loc, cast(D), TemplateArgs); } Decl *TemplateInstantiator::TransformDefinition(SourceLocation Loc, Decl *D) { Decl *Inst = getSema().SubstDecl(D, getSema().CurContext, TemplateArgs); if (!Inst) return nullptr; getSema().CurrentInstantiationScope->InstantiatedLocal(D, Inst); return Inst; } NamedDecl * TemplateInstantiator::TransformFirstQualifierInScope(NamedDecl *D, SourceLocation Loc) { // If the first part of the nested-name-specifier was a template type // parameter, instantiate that type parameter down to a tag type. if (TemplateTypeParmDecl *TTPD = dyn_cast_or_null(D)) { const TemplateTypeParmType *TTP = cast(getSema().Context.getTypeDeclType(TTPD)); if (TTP->getDepth() < TemplateArgs.getNumLevels()) { // FIXME: This needs testing w/ member access expressions. TemplateArgument Arg = TemplateArgs(TTP->getDepth(), TTP->getIndex()); if (TTP->isParameterPack()) { assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); if (getSema().ArgumentPackSubstitutionIndex == -1) return nullptr; Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); } QualType T = Arg.getAsType(); if (T.isNull()) return cast_or_null(TransformDecl(Loc, D)); if (const TagType *Tag = T->getAs()) return Tag->getDecl(); // The resulting type is not a tag; complain. getSema().Diag(Loc, diag::err_nested_name_spec_non_tag) << T; return nullptr; } } return cast_or_null(TransformDecl(Loc, D)); } VarDecl * TemplateInstantiator::RebuildExceptionDecl(VarDecl *ExceptionDecl, TypeSourceInfo *Declarator, SourceLocation StartLoc, SourceLocation NameLoc, IdentifierInfo *Name) { VarDecl *Var = inherited::RebuildExceptionDecl(ExceptionDecl, Declarator, StartLoc, NameLoc, Name); if (Var) getSema().CurrentInstantiationScope->InstantiatedLocal(ExceptionDecl, Var); return Var; } VarDecl *TemplateInstantiator::RebuildObjCExceptionDecl(VarDecl *ExceptionDecl, TypeSourceInfo *TSInfo, QualType T) { VarDecl *Var = inherited::RebuildObjCExceptionDecl(ExceptionDecl, TSInfo, T); if (Var) getSema().CurrentInstantiationScope->InstantiatedLocal(ExceptionDecl, Var); return Var; } QualType TemplateInstantiator::RebuildElaboratedType(SourceLocation KeywordLoc, ElaboratedTypeKeyword Keyword, NestedNameSpecifierLoc QualifierLoc, QualType T) { if (const TagType *TT = T->getAs()) { TagDecl* TD = TT->getDecl(); SourceLocation TagLocation = KeywordLoc; IdentifierInfo *Id = TD->getIdentifier(); // TODO: should we even warn on struct/class mismatches for this? Seems // like it's likely to produce a lot of spurious errors. 
if (Id && Keyword != ETK_None && Keyword != ETK_Typename) { TagTypeKind Kind = TypeWithKeyword::getTagTypeKindForKeyword(Keyword); if (!SemaRef.isAcceptableTagRedeclaration(TD, Kind, /*isDefinition*/false, TagLocation, Id)) { SemaRef.Diag(TagLocation, diag::err_use_with_wrong_tag) << Id << FixItHint::CreateReplacement(SourceRange(TagLocation), TD->getKindName()); SemaRef.Diag(TD->getLocation(), diag::note_previous_use); } } } return inherited::RebuildElaboratedType(KeywordLoc, Keyword, QualifierLoc, T); } TemplateName TemplateInstantiator::TransformTemplateName( CXXScopeSpec &SS, TemplateName Name, SourceLocation NameLoc, QualType ObjectType, NamedDecl *FirstQualifierInScope, bool AllowInjectedClassName) { if (TemplateTemplateParmDecl *TTP = dyn_cast_or_null(Name.getAsTemplateDecl())) { if (TTP->getDepth() < TemplateArgs.getNumLevels()) { // If the corresponding template argument is NULL or non-existent, it's // because we are performing instantiation from explicitly-specified // template arguments in a function template, but there were some // arguments left unspecified. if (!TemplateArgs.hasTemplateArgument(TTP->getDepth(), TTP->getPosition())) return Name; TemplateArgument Arg = TemplateArgs(TTP->getDepth(), TTP->getPosition()); if (TemplateArgs.isRewrite()) { // We're rewriting the template parameter as a reference to another // template parameter. if (Arg.getKind() == TemplateArgument::Pack) { assert(Arg.pack_size() == 1 && Arg.pack_begin()->isPackExpansion() && "unexpected pack arguments in template rewrite"); Arg = Arg.pack_begin()->getPackExpansionPattern(); } assert(Arg.getKind() == TemplateArgument::Template && "unexpected nontype template argument kind in template rewrite"); return Arg.getAsTemplate(); } auto [AssociatedDecl, Final] = TemplateArgs.getAssociatedDecl(TTP->getDepth()); std::optional PackIndex; if (TTP->isParameterPack()) { assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); if (getSema().ArgumentPackSubstitutionIndex == -1) { // We have the template argument pack to substitute, but we're not // actually expanding the enclosing pack expansion yet. So, just // keep the entire argument pack. 
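        // For illustration, a sketch (not tied to a specific caller): a
        // template template parameter pack such as 'TTs' below may be
        // referenced while its enclosing pack expansion is not being expanded
        // yet; in that case the whole argument pack is retained, as the
        // comment above describes.
        //
        //   template <template <class> class... TTs> struct List {};
        //   template <template <class> class... TTs> using L = List<TTs...>;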
return getSema().Context.getSubstTemplateTemplateParmPack( Arg, AssociatedDecl, TTP->getIndex(), Final); } PackIndex = getPackIndex(Arg); Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); } TemplateName Template = Arg.getAsTemplate().getNameToSubstitute(); assert(!Template.isNull() && "Null template template argument"); assert(!Template.getAsQualifiedTemplateName() && "template decl to substitute is qualified?"); if (Final) return Template; return getSema().Context.getSubstTemplateTemplateParm( Template, AssociatedDecl, TTP->getIndex(), PackIndex); } } if (SubstTemplateTemplateParmPackStorage *SubstPack = Name.getAsSubstTemplateTemplateParmPack()) { if (getSema().ArgumentPackSubstitutionIndex == -1) return Name; TemplateArgument Pack = SubstPack->getArgumentPack(); TemplateName Template = getPackSubstitutedTemplateArgument(getSema(), Pack).getAsTemplate(); if (SubstPack->getFinal()) return Template; return getSema().Context.getSubstTemplateTemplateParm( Template.getNameToSubstitute(), SubstPack->getAssociatedDecl(), SubstPack->getIndex(), getPackIndex(Pack)); } return inherited::TransformTemplateName(SS, Name, NameLoc, ObjectType, FirstQualifierInScope, AllowInjectedClassName); } ExprResult TemplateInstantiator::TransformPredefinedExpr(PredefinedExpr *E) { if (!E->isTypeDependent()) return E; return getSema().BuildPredefinedExpr(E->getLocation(), E->getIdentKind()); } ExprResult TemplateInstantiator::TransformTemplateParmRefExpr(DeclRefExpr *E, NonTypeTemplateParmDecl *NTTP) { // If the corresponding template argument is NULL or non-existent, it's // because we are performing instantiation from explicitly-specified // template arguments in a function template, but there were some // arguments left unspecified. if (!TemplateArgs.hasTemplateArgument(NTTP->getDepth(), NTTP->getPosition())) return E; TemplateArgument Arg = TemplateArgs(NTTP->getDepth(), NTTP->getPosition()); if (TemplateArgs.isRewrite()) { // We're rewriting the template parameter as a reference to another // template parameter. if (Arg.getKind() == TemplateArgument::Pack) { assert(Arg.pack_size() == 1 && Arg.pack_begin()->isPackExpansion() && "unexpected pack arguments in template rewrite"); Arg = Arg.pack_begin()->getPackExpansionPattern(); } assert(Arg.getKind() == TemplateArgument::Expression && "unexpected nontype template argument kind in template rewrite"); // FIXME: This can lead to the same subexpression appearing multiple times // in a complete expression. return Arg.getAsExpr(); } auto [AssociatedDecl, _] = TemplateArgs.getAssociatedDecl(NTTP->getDepth()); std::optional PackIndex; if (NTTP->isParameterPack()) { assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); if (getSema().ArgumentPackSubstitutionIndex == -1) { // We have an argument pack, but we can't select a particular argument // out of it yet. Therefore, we'll build an expression to hold on to that // argument pack. QualType TargetType = SemaRef.SubstType(NTTP->getType(), TemplateArgs, E->getLocation(), NTTP->getDeclName()); if (TargetType.isNull()) return ExprError(); QualType ExprType = TargetType.getNonLValueExprType(SemaRef.Context); if (TargetType->isRecordType()) ExprType.addConst(); // FIXME: Pass in Final. return new (SemaRef.Context) SubstNonTypeTemplateParmPackExpr( ExprType, TargetType->isReferenceType() ? 
VK_LValue : VK_PRValue, E->getLocation(), Arg, AssociatedDecl, NTTP->getPosition()); } PackIndex = getPackIndex(Arg); Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); } // FIXME: Don't put subst node on Final replacement. return transformNonTypeTemplateParmRef(AssociatedDecl, NTTP, E->getLocation(), Arg, PackIndex); } const LoopHintAttr * TemplateInstantiator::TransformLoopHintAttr(const LoopHintAttr *LH) { Expr *TransformedExpr = getDerived().TransformExpr(LH->getValue()).get(); if (TransformedExpr == LH->getValue()) return LH; // Generate error if there is a problem with the value. if (getSema().CheckLoopHintExpr(TransformedExpr, LH->getLocation())) return LH; // Create new LoopHintValueAttr with integral expression in place of the // non-type template parameter. return LoopHintAttr::CreateImplicit(getSema().Context, LH->getOption(), LH->getState(), TransformedExpr, *LH); } const NoInlineAttr *TemplateInstantiator::TransformStmtNoInlineAttr( const Stmt *OrigS, const Stmt *InstS, const NoInlineAttr *A) { if (!A || getSema().CheckNoInlineAttr(OrigS, InstS, *A)) return nullptr; return A; } const AlwaysInlineAttr *TemplateInstantiator::TransformStmtAlwaysInlineAttr( const Stmt *OrigS, const Stmt *InstS, const AlwaysInlineAttr *A) { if (!A || getSema().CheckAlwaysInlineAttr(OrigS, InstS, *A)) return nullptr; return A; } ExprResult TemplateInstantiator::transformNonTypeTemplateParmRef( Decl *AssociatedDecl, const NonTypeTemplateParmDecl *parm, SourceLocation loc, TemplateArgument arg, std::optional PackIndex) { ExprResult result; // Determine the substituted parameter type. We can usually infer this from // the template argument, but not always. auto SubstParamType = [&] { QualType T; if (parm->isExpandedParameterPack()) T = parm->getExpansionType(SemaRef.ArgumentPackSubstitutionIndex); else T = parm->getType(); if (parm->isParameterPack() && isa(T)) T = cast(T)->getPattern(); return SemaRef.SubstType(T, TemplateArgs, loc, parm->getDeclName()); }; bool refParam = false; // The template argument itself might be an expression, in which case we just // return that expression. This happens when substituting into an alias // template. if (arg.getKind() == TemplateArgument::Expression) { Expr *argExpr = arg.getAsExpr(); result = argExpr; if (argExpr->isLValue()) { if (argExpr->getType()->isRecordType()) { // Check whether the parameter was actually a reference. QualType paramType = SubstParamType(); if (paramType.isNull()) return ExprError(); refParam = paramType->isReferenceType(); } else { refParam = true; } } } else if (arg.getKind() == TemplateArgument::Declaration || arg.getKind() == TemplateArgument::NullPtr) { ValueDecl *VD; if (arg.getKind() == TemplateArgument::Declaration) { VD = arg.getAsDecl(); // Find the instantiation of the template argument. This is // required for nested templates. VD = cast_or_null( getSema().FindInstantiatedDecl(loc, VD, TemplateArgs)); if (!VD) return ExprError(); } else { // Propagate NULL template argument. VD = nullptr; } QualType paramType = VD ? 
arg.getParamTypeForDecl() : arg.getNullPtrType(); assert(!paramType.isNull() && "type substitution failed for param type"); assert(!paramType->isDependentType() && "param type still dependent"); result = SemaRef.BuildExpressionFromDeclTemplateArgument(arg, paramType, loc); refParam = paramType->isReferenceType(); } else { result = SemaRef.BuildExpressionFromIntegralTemplateArgument(arg, loc); assert(result.isInvalid() || SemaRef.Context.hasSameType(result.get()->getType(), arg.getIntegralType())); } if (result.isInvalid()) return ExprError(); Expr *resultExpr = result.get(); // FIXME: Don't put subst node on final replacement. return new (SemaRef.Context) SubstNonTypeTemplateParmExpr( resultExpr->getType(), resultExpr->getValueKind(), loc, resultExpr, AssociatedDecl, parm->getIndex(), PackIndex, refParam); } ExprResult TemplateInstantiator::TransformSubstNonTypeTemplateParmPackExpr( SubstNonTypeTemplateParmPackExpr *E) { if (getSema().ArgumentPackSubstitutionIndex == -1) { // We aren't expanding the parameter pack, so just return ourselves. return E; } TemplateArgument Pack = E->getArgumentPack(); TemplateArgument Arg = getPackSubstitutedTemplateArgument(getSema(), Pack); // FIXME: Don't put subst node on final replacement. return transformNonTypeTemplateParmRef( E->getAssociatedDecl(), E->getParameterPack(), E->getParameterPackLocation(), Arg, getPackIndex(Pack)); } ExprResult TemplateInstantiator::TransformSubstNonTypeTemplateParmExpr( SubstNonTypeTemplateParmExpr *E) { ExprResult SubstReplacement = E->getReplacement(); if (!isa(SubstReplacement.get())) SubstReplacement = TransformExpr(E->getReplacement()); if (SubstReplacement.isInvalid()) return true; QualType SubstType = TransformType(E->getParameterType(getSema().Context)); if (SubstType.isNull()) return true; // The type may have been previously dependent and not now, which means we // might have to implicit cast the argument to the new type, for example: // template // concept C = sizeof(U) == 4; // void foo() requires C<2, 'a'> { } // When normalizing foo(), we first form the normalized constraints of C: // AtomicExpr(sizeof(U) == 4, // U=SubstNonTypeTemplateParmExpr(Param=U, // Expr=DeclRef(U), // Type=decltype(T))) // Then we substitute T = 2, U = 'a' into the parameter mapping, and need to // produce: // AtomicExpr(sizeof(U) == 4, // U=SubstNonTypeTemplateParmExpr(Param=U, // Expr=ImpCast( // decltype(2), // SubstNTTPE(Param=U, Expr='a', // Type=char)), // Type=decltype(2))) // The call to CheckTemplateArgument here produces the ImpCast. TemplateArgument SugaredConverted, CanonicalConverted; if (SemaRef .CheckTemplateArgument(E->getParameter(), SubstType, SubstReplacement.get(), SugaredConverted, CanonicalConverted, Sema::CTAK_Specified) .isInvalid()) return true; return transformNonTypeTemplateParmRef(E->getAssociatedDecl(), E->getParameter(), E->getExprLoc(), SugaredConverted, E->getPackIndex()); } ExprResult TemplateInstantiator::RebuildVarDeclRefExpr(VarDecl *PD, SourceLocation Loc) { DeclarationNameInfo NameInfo(PD->getDeclName(), Loc); return getSema().BuildDeclarationNameExpr(CXXScopeSpec(), NameInfo, PD); } ExprResult TemplateInstantiator::TransformFunctionParmPackExpr(FunctionParmPackExpr *E) { if (getSema().ArgumentPackSubstitutionIndex != -1) { // We can expand this parameter pack now. 
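  // For illustration, a rough sketch of how such nodes can arise (an
  // assumed example, not taken from this patch):
  //
  //   int g(...);
  //   template <class... Ts> struct S {
  //     template <class... Us> auto f(Ts... ts) -> decltype(g(Us(ts)...));
  //   };
  //
  // Instantiating S<int, double> expands the parameters 'ts', but the
  // reference to 'ts' inside 'Us(ts)...' cannot be expanded until 'Us' is
  // known, so it is kept as a pack over the newly created parameters.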
VarDecl *D = E->getExpansion(getSema().ArgumentPackSubstitutionIndex); VarDecl *VD = cast_or_null(TransformDecl(E->getExprLoc(), D)); if (!VD) return ExprError(); return RebuildVarDeclRefExpr(VD, E->getExprLoc()); } QualType T = TransformType(E->getType()); if (T.isNull()) return ExprError(); // Transform each of the parameter expansions into the corresponding // parameters in the instantiation of the function decl. SmallVector Vars; Vars.reserve(E->getNumExpansions()); for (FunctionParmPackExpr::iterator I = E->begin(), End = E->end(); I != End; ++I) { VarDecl *D = cast_or_null(TransformDecl(E->getExprLoc(), *I)); if (!D) return ExprError(); Vars.push_back(D); } auto *PackExpr = FunctionParmPackExpr::Create(getSema().Context, T, E->getParameterPack(), E->getParameterPackLocation(), Vars); getSema().MarkFunctionParmPackReferenced(PackExpr); return PackExpr; } ExprResult TemplateInstantiator::TransformFunctionParmPackRefExpr(DeclRefExpr *E, VarDecl *PD) { typedef LocalInstantiationScope::DeclArgumentPack DeclArgumentPack; llvm::PointerUnion *Found = getSema().CurrentInstantiationScope->findInstantiationOf(PD); assert(Found && "no instantiation for parameter pack"); Decl *TransformedDecl; if (DeclArgumentPack *Pack = Found->dyn_cast()) { // If this is a reference to a function parameter pack which we can // substitute but can't yet expand, build a FunctionParmPackExpr for it. if (getSema().ArgumentPackSubstitutionIndex == -1) { QualType T = TransformType(E->getType()); if (T.isNull()) return ExprError(); auto *PackExpr = FunctionParmPackExpr::Create(getSema().Context, T, PD, E->getExprLoc(), *Pack); getSema().MarkFunctionParmPackReferenced(PackExpr); return PackExpr; } TransformedDecl = (*Pack)[getSema().ArgumentPackSubstitutionIndex]; } else { TransformedDecl = Found->get(); } // We have either an unexpanded pack or a specific expansion. return RebuildVarDeclRefExpr(cast(TransformedDecl), E->getExprLoc()); } ExprResult TemplateInstantiator::TransformDeclRefExpr(DeclRefExpr *E) { NamedDecl *D = E->getDecl(); // Handle references to non-type template parameters and non-type template // parameter packs. if (NonTypeTemplateParmDecl *NTTP = dyn_cast(D)) { if (NTTP->getDepth() < TemplateArgs.getNumLevels()) return TransformTemplateParmRefExpr(E, NTTP); // We have a non-type template parameter that isn't fully substituted; // FindInstantiatedDecl will find it in the local instantiation scope. } // Handle references to function parameter packs. if (VarDecl *PD = dyn_cast(D)) if (PD->isParameterPack()) return TransformFunctionParmPackRefExpr(E, PD); return inherited::TransformDeclRefExpr(E); } ExprResult TemplateInstantiator::TransformCXXDefaultArgExpr( CXXDefaultArgExpr *E) { assert(!cast(E->getParam()->getDeclContext())-> getDescribedFunctionTemplate() && "Default arg expressions are never formed in dependent cases."); return SemaRef.BuildCXXDefaultArgExpr( E->getUsedLocation(), cast(E->getParam()->getDeclContext()), E->getParam()); } template QualType TemplateInstantiator::TransformFunctionProtoType(TypeLocBuilder &TLB, FunctionProtoTypeLoc TL, CXXRecordDecl *ThisContext, Qualifiers ThisTypeQuals, Fn TransformExceptionSpec) { // We need a local instantiation scope for this function prototype. 
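  // For illustration, a minimal sketch of why the scope below is needed:
  // later parts of a function prototype may refer back to earlier
  // parameters, so substitution must map each old parameter to its
  // instantiated counterpart.
  //
  //   template <class T>
  //   void f(T a, decltype(a) b) noexcept(noexcept(T(a)));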
LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true); return inherited::TransformFunctionProtoType( TLB, TL, ThisContext, ThisTypeQuals, TransformExceptionSpec); } ParmVarDecl *TemplateInstantiator::TransformFunctionTypeParam( ParmVarDecl *OldParm, int indexAdjustment, std::optional NumExpansions, bool ExpectParameterPack) { auto NewParm = SemaRef.SubstParmVarDecl( OldParm, TemplateArgs, indexAdjustment, NumExpansions, ExpectParameterPack, EvaluateConstraints); if (NewParm && SemaRef.getLangOpts().OpenCL) SemaRef.deduceOpenCLAddressSpace(NewParm); return NewParm; } QualType TemplateInstantiator::BuildSubstTemplateTypeParmType( TypeLocBuilder &TLB, bool SuppressObjCLifetime, bool Final, Decl *AssociatedDecl, unsigned Index, std::optional PackIndex, TemplateArgument Arg, SourceLocation NameLoc) { QualType Replacement = Arg.getAsType(); // If the template parameter had ObjC lifetime qualifiers, // then any such qualifiers on the replacement type are ignored. if (SuppressObjCLifetime) { Qualifiers RQs; RQs = Replacement.getQualifiers(); RQs.removeObjCLifetime(); Replacement = SemaRef.Context.getQualifiedType(Replacement.getUnqualifiedType(), RQs); } if (Final) { TLB.pushTrivial(SemaRef.Context, Replacement, NameLoc); return Replacement; } // TODO: only do this uniquing once, at the start of instantiation. QualType Result = getSema().Context.getSubstTemplateTypeParmType( Replacement, AssociatedDecl, Index, PackIndex); SubstTemplateTypeParmTypeLoc NewTL = TLB.push(Result); NewTL.setNameLoc(NameLoc); return Result; } QualType TemplateInstantiator::TransformTemplateTypeParmType(TypeLocBuilder &TLB, TemplateTypeParmTypeLoc TL, bool SuppressObjCLifetime) { const TemplateTypeParmType *T = TL.getTypePtr(); if (T->getDepth() < TemplateArgs.getNumLevels()) { // Replace the template type parameter with its corresponding // template argument. // If the corresponding template argument is NULL or doesn't exist, it's // because we are performing instantiation from explicitly-specified // template arguments in a function template class, but there were some // arguments left unspecified. if (!TemplateArgs.hasTemplateArgument(T->getDepth(), T->getIndex())) { TemplateTypeParmTypeLoc NewTL = TLB.push(TL.getType()); NewTL.setNameLoc(TL.getNameLoc()); return TL.getType(); } TemplateArgument Arg = TemplateArgs(T->getDepth(), T->getIndex()); if (TemplateArgs.isRewrite()) { // We're rewriting the template parameter as a reference to another // template parameter. if (Arg.getKind() == TemplateArgument::Pack) { assert(Arg.pack_size() == 1 && Arg.pack_begin()->isPackExpansion() && "unexpected pack arguments in template rewrite"); Arg = Arg.pack_begin()->getPackExpansionPattern(); } assert(Arg.getKind() == TemplateArgument::Type && "unexpected nontype template argument kind in template rewrite"); QualType NewT = Arg.getAsType(); assert(isa(NewT) && "type parm not rewritten to type parm"); auto NewTL = TLB.push(NewT); NewTL.setNameLoc(TL.getNameLoc()); return NewT; } auto [AssociatedDecl, Final] = TemplateArgs.getAssociatedDecl(T->getDepth()); std::optional PackIndex; if (T->isParameterPack()) { assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); if (getSema().ArgumentPackSubstitutionIndex == -1) { // We have the template argument pack, but we're not expanding the // enclosing pack expansion yet. Just save the template argument // pack for later substitution. 
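        // For illustration, a sketch of the partial-substitution case
        // described above (assuming <utility> is included):
        //
        //   template <class... Ts> struct A {
        //     template <class... Us> static void f(std::pair<Ts, Us>...);
        //   };
        //
        // Instantiating A<int, long> substitutes Ts = {int, long} into the
        // pattern 'std::pair<Ts, Us>', but the expansion cannot be expanded
        // yet because 'Us' is still undeduced, so 'Ts' is kept as a
        // substituted-but-unexpanded pack.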
QualType Result = getSema().Context.getSubstTemplateTypeParmPackType( AssociatedDecl, T->getIndex(), Final, Arg); SubstTemplateTypeParmPackTypeLoc NewTL = TLB.push(Result); NewTL.setNameLoc(TL.getNameLoc()); return Result; } // PackIndex starts from last element. PackIndex = getPackIndex(Arg); Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); } assert(Arg.getKind() == TemplateArgument::Type && "Template argument kind mismatch"); return BuildSubstTemplateTypeParmType(TLB, SuppressObjCLifetime, Final, AssociatedDecl, T->getIndex(), PackIndex, Arg, TL.getNameLoc()); } // The template type parameter comes from an inner template (e.g., // the template parameter list of a member template inside the // template we are instantiating). Create a new template type // parameter with the template "level" reduced by one. TemplateTypeParmDecl *NewTTPDecl = nullptr; if (TemplateTypeParmDecl *OldTTPDecl = T->getDecl()) NewTTPDecl = cast_or_null( TransformDecl(TL.getNameLoc(), OldTTPDecl)); QualType Result = getSema().Context.getTemplateTypeParmType( T->getDepth() - TemplateArgs.getNumSubstitutedLevels(), T->getIndex(), T->isParameterPack(), NewTTPDecl); TemplateTypeParmTypeLoc NewTL = TLB.push(Result); NewTL.setNameLoc(TL.getNameLoc()); return Result; } QualType TemplateInstantiator::TransformSubstTemplateTypeParmPackType( TypeLocBuilder &TLB, SubstTemplateTypeParmPackTypeLoc TL, bool SuppressObjCLifetime) { const SubstTemplateTypeParmPackType *T = TL.getTypePtr(); Decl *NewReplaced = TransformDecl(TL.getNameLoc(), T->getAssociatedDecl()); if (getSema().ArgumentPackSubstitutionIndex == -1) { // We aren't expanding the parameter pack, so just return ourselves. QualType Result = TL.getType(); if (NewReplaced != T->getAssociatedDecl()) Result = getSema().Context.getSubstTemplateTypeParmPackType( NewReplaced, T->getIndex(), T->getFinal(), T->getArgumentPack()); SubstTemplateTypeParmPackTypeLoc NewTL = TLB.push(Result); NewTL.setNameLoc(TL.getNameLoc()); return Result; } TemplateArgument Pack = T->getArgumentPack(); TemplateArgument Arg = getPackSubstitutedTemplateArgument(getSema(), Pack); return BuildSubstTemplateTypeParmType( TLB, SuppressObjCLifetime, T->getFinal(), NewReplaced, T->getIndex(), getPackIndex(Pack), Arg, TL.getNameLoc()); } -template static concepts::Requirement::SubstitutionDiagnostic * -createSubstDiag(Sema &S, TemplateDeductionInfo &Info, EntityPrinter Printer) { +createSubstDiag(Sema &S, TemplateDeductionInfo &Info, + concepts::EntityPrinter Printer) { SmallString<128> Message; SourceLocation ErrorLoc; if (Info.hasSFINAEDiagnostic()) { PartialDiagnosticAt PDA(SourceLocation(), PartialDiagnostic::NullDiagnostic{}); Info.takeSFINAEDiagnostic(PDA); PDA.second.EmitToString(S.getDiagnostics(), Message); ErrorLoc = PDA.first; } else { ErrorLoc = Info.getLocation(); } char *MessageBuf = new (S.Context) char[Message.size()]; std::copy(Message.begin(), Message.end(), MessageBuf); SmallString<128> Entity; llvm::raw_svector_ostream OS(Entity); Printer(OS); char *EntityBuf = new (S.Context) char[Entity.size()]; std::copy(Entity.begin(), Entity.end(), EntityBuf); return new (S.Context) concepts::Requirement::SubstitutionDiagnostic{ StringRef(EntityBuf, Entity.size()), ErrorLoc, StringRef(MessageBuf, Message.size())}; } +concepts::Requirement::SubstitutionDiagnostic * +concepts::createSubstDiagAt(Sema &S, SourceLocation Location, + EntityPrinter Printer) { + SmallString<128> Entity; + llvm::raw_svector_ostream OS(Entity); + Printer(OS); + char *EntityBuf = new (S.Context) char[Entity.size()]; + 
llvm::copy(Entity, EntityBuf); + return new (S.Context) concepts::Requirement::SubstitutionDiagnostic{ + /*SubstitutedEntity=*/StringRef(EntityBuf, Entity.size()), + /*DiagLoc=*/Location, /*DiagMessage=*/StringRef()}; +} + ExprResult TemplateInstantiator::TransformRequiresTypeParams( SourceLocation KWLoc, SourceLocation RBraceLoc, const RequiresExpr *RE, RequiresExprBodyDecl *Body, ArrayRef Params, SmallVectorImpl &PTypes, SmallVectorImpl &TransParams, Sema::ExtParameterInfoBuilder &PInfos) { TemplateDeductionInfo Info(KWLoc); Sema::InstantiatingTemplate TypeInst(SemaRef, KWLoc, RE, Info, SourceRange{KWLoc, RBraceLoc}); Sema::SFINAETrap Trap(SemaRef); unsigned ErrorIdx; if (getDerived().TransformFunctionTypeParams( KWLoc, Params, /*ParamTypes=*/nullptr, /*ParamInfos=*/nullptr, PTypes, &TransParams, PInfos, &ErrorIdx) || Trap.hasErrorOccurred()) { SmallVector TransReqs; ParmVarDecl *FailedDecl = Params[ErrorIdx]; // Add a 'failed' Requirement to contain the error that caused the failure // here. TransReqs.push_back(RebuildTypeRequirement(createSubstDiag( SemaRef, Info, [&](llvm::raw_ostream &OS) { OS << *FailedDecl; }))); return getDerived().RebuildRequiresExpr(KWLoc, Body, TransParams, TransReqs, RBraceLoc); } return ExprResult{}; } concepts::TypeRequirement * TemplateInstantiator::TransformTypeRequirement(concepts::TypeRequirement *Req) { if (!Req->isDependent() && !AlwaysRebuild()) return Req; if (Req->isSubstitutionFailure()) { if (AlwaysRebuild()) return RebuildTypeRequirement( Req->getSubstitutionDiagnostic()); return Req; } Sema::SFINAETrap Trap(SemaRef); TemplateDeductionInfo Info(Req->getType()->getTypeLoc().getBeginLoc()); Sema::InstantiatingTemplate TypeInst(SemaRef, Req->getType()->getTypeLoc().getBeginLoc(), Req, Info, Req->getType()->getTypeLoc().getSourceRange()); if (TypeInst.isInvalid()) return nullptr; TypeSourceInfo *TransType = TransformType(Req->getType()); if (!TransType || Trap.hasErrorOccurred()) return RebuildTypeRequirement(createSubstDiag(SemaRef, Info, [&] (llvm::raw_ostream& OS) { Req->getType()->getType().print(OS, SemaRef.getPrintingPolicy()); })); return RebuildTypeRequirement(TransType); } concepts::ExprRequirement * TemplateInstantiator::TransformExprRequirement(concepts::ExprRequirement *Req) { if (!Req->isDependent() && !AlwaysRebuild()) return Req; Sema::SFINAETrap Trap(SemaRef); llvm::PointerUnion TransExpr; if (Req->isExprSubstitutionFailure()) TransExpr = Req->getExprSubstitutionDiagnostic(); else { Expr *E = Req->getExpr(); TemplateDeductionInfo Info(E->getBeginLoc()); Sema::InstantiatingTemplate ExprInst(SemaRef, E->getBeginLoc(), Req, Info, E->getSourceRange()); if (ExprInst.isInvalid()) return nullptr; ExprResult TransExprRes = TransformExpr(E); if (!TransExprRes.isInvalid() && !Trap.hasErrorOccurred() && TransExprRes.get()->hasPlaceholderType()) TransExprRes = SemaRef.CheckPlaceholderExpr(TransExprRes.get()); if (TransExprRes.isInvalid() || Trap.hasErrorOccurred()) TransExpr = createSubstDiag(SemaRef, Info, [&](llvm::raw_ostream &OS) { E->printPretty(OS, nullptr, SemaRef.getPrintingPolicy()); }); else TransExpr = TransExprRes.get(); } std::optional TransRetReq; const auto &RetReq = Req->getReturnTypeRequirement(); if (RetReq.isEmpty()) TransRetReq.emplace(); else if (RetReq.isSubstitutionFailure()) TransRetReq.emplace(RetReq.getSubstitutionDiagnostic()); else if (RetReq.isTypeConstraint()) { TemplateParameterList *OrigTPL = RetReq.getTypeConstraintTemplateParameterList(); TemplateDeductionInfo Info(OrigTPL->getTemplateLoc()); 
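    // For illustration, a minimal sketch of an expression requirement with a
    // constrained return-type-requirement, the case handled here:
    //
    //   #include <concepts>
    //   template <class T>
    //   concept Addable = requires(T a, T b) {
    //     { a + b } noexcept -> std::convertible_to<T>;
    //   };
    //
    // The '-> std::convertible_to<T>' part carries an invented template
    // parameter list, which must itself be substituted.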
Sema::InstantiatingTemplate TPLInst(SemaRef, OrigTPL->getTemplateLoc(), Req, Info, OrigTPL->getSourceRange()); if (TPLInst.isInvalid()) return nullptr; TemplateParameterList *TPL = TransformTemplateParameterList(OrigTPL); if (!TPL) TransRetReq.emplace(createSubstDiag(SemaRef, Info, [&] (llvm::raw_ostream& OS) { RetReq.getTypeConstraint()->getImmediatelyDeclaredConstraint() ->printPretty(OS, nullptr, SemaRef.getPrintingPolicy()); })); else { TPLInst.Clear(); TransRetReq.emplace(TPL); } } assert(TransRetReq && "All code paths leading here must set TransRetReq"); if (Expr *E = TransExpr.dyn_cast()) return RebuildExprRequirement(E, Req->isSimple(), Req->getNoexceptLoc(), std::move(*TransRetReq)); return RebuildExprRequirement( TransExpr.get(), Req->isSimple(), Req->getNoexceptLoc(), std::move(*TransRetReq)); } concepts::NestedRequirement * TemplateInstantiator::TransformNestedRequirement( concepts::NestedRequirement *Req) { if (!Req->isDependent() && !AlwaysRebuild()) return Req; if (Req->hasInvalidConstraint()) { if (AlwaysRebuild()) return RebuildNestedRequirement(Req->getInvalidConstraintEntity(), Req->getConstraintSatisfaction()); return Req; } Sema::InstantiatingTemplate ReqInst(SemaRef, Req->getConstraintExpr()->getBeginLoc(), Req, Sema::InstantiatingTemplate::ConstraintsCheck{}, Req->getConstraintExpr()->getSourceRange()); ExprResult TransConstraint; ConstraintSatisfaction Satisfaction; TemplateDeductionInfo Info(Req->getConstraintExpr()->getBeginLoc()); { EnterExpressionEvaluationContext ContextRAII( SemaRef, Sema::ExpressionEvaluationContext::ConstantEvaluated); Sema::SFINAETrap Trap(SemaRef); Sema::InstantiatingTemplate ConstrInst(SemaRef, Req->getConstraintExpr()->getBeginLoc(), Req, Info, Req->getConstraintExpr()->getSourceRange()); if (ConstrInst.isInvalid()) return nullptr; llvm::SmallVector Result; if (!SemaRef.CheckConstraintSatisfaction( nullptr, {Req->getConstraintExpr()}, Result, TemplateArgs, Req->getConstraintExpr()->getSourceRange(), Satisfaction) && !Result.empty()) TransConstraint = Result[0]; assert(!Trap.hasErrorOccurred() && "Substitution failures must be handled " "by CheckConstraintSatisfaction."); } if (TransConstraint.isUsable() && TransConstraint.get()->isInstantiationDependent()) return new (SemaRef.Context) concepts::NestedRequirement(TransConstraint.get()); if (TransConstraint.isInvalid() || !TransConstraint.get() || Satisfaction.HasSubstitutionFailure()) { SmallString<128> Entity; llvm::raw_svector_ostream OS(Entity); Req->getConstraintExpr()->printPretty(OS, nullptr, SemaRef.getPrintingPolicy()); char *EntityBuf = new (SemaRef.Context) char[Entity.size()]; std::copy(Entity.begin(), Entity.end(), EntityBuf); return new (SemaRef.Context) concepts::NestedRequirement( SemaRef.Context, StringRef(EntityBuf, Entity.size()), Satisfaction); } return new (SemaRef.Context) concepts::NestedRequirement( SemaRef.Context, TransConstraint.get(), Satisfaction); } /// Perform substitution on the type T with a given set of template /// arguments. /// /// This routine substitutes the given template arguments into the /// type T and produces the instantiated type. /// /// \param T the type into which the template arguments will be /// substituted. If this type is not dependent, it will be returned /// immediately. /// /// \param Args the template arguments that will be /// substituted for the top-level template parameters within T. /// /// \param Loc the location in the source code where this substitution /// is being performed. 
It will typically be the location of the /// declarator (if we're instantiating the type of some declaration) /// or the location of the type in the source code (if, e.g., we're /// instantiating the type of a cast expression). /// /// \param Entity the name of the entity associated with a declaration /// being instantiated (if any). May be empty to indicate that there /// is no such entity (if, e.g., this is a type that occurs as part of /// a cast expression) or that the entity has no name (e.g., an /// unnamed function parameter). /// /// \param AllowDeducedTST Whether a DeducedTemplateSpecializationType is /// acceptable as the top level type of the result. /// /// \returns If the instantiation succeeds, the instantiated /// type. Otherwise, produces diagnostics and returns a NULL type. TypeSourceInfo *Sema::SubstType(TypeSourceInfo *T, const MultiLevelTemplateArgumentList &Args, SourceLocation Loc, DeclarationName Entity, bool AllowDeducedTST) { assert(!CodeSynthesisContexts.empty() && "Cannot perform an instantiation without some context on the " "instantiation stack"); if (!T->getType()->isInstantiationDependentType() && !T->getType()->isVariablyModifiedType()) return T; TemplateInstantiator Instantiator(*this, Args, Loc, Entity); return AllowDeducedTST ? Instantiator.TransformTypeWithDeducedTST(T) : Instantiator.TransformType(T); } TypeSourceInfo *Sema::SubstType(TypeLoc TL, const MultiLevelTemplateArgumentList &Args, SourceLocation Loc, DeclarationName Entity) { assert(!CodeSynthesisContexts.empty() && "Cannot perform an instantiation without some context on the " "instantiation stack"); if (TL.getType().isNull()) return nullptr; if (!TL.getType()->isInstantiationDependentType() && !TL.getType()->isVariablyModifiedType()) { // FIXME: Make a copy of the TypeLoc data here, so that we can // return a new TypeSourceInfo. Inefficient! TypeLocBuilder TLB; TLB.pushFullCopy(TL); return TLB.getTypeSourceInfo(Context, TL.getType()); } TemplateInstantiator Instantiator(*this, Args, Loc, Entity); TypeLocBuilder TLB; TLB.reserve(TL.getFullDataSize()); QualType Result = Instantiator.TransformType(TLB, TL); if (Result.isNull()) return nullptr; return TLB.getTypeSourceInfo(Context, Result); } /// Deprecated form of the above. QualType Sema::SubstType(QualType T, const MultiLevelTemplateArgumentList &TemplateArgs, SourceLocation Loc, DeclarationName Entity) { assert(!CodeSynthesisContexts.empty() && "Cannot perform an instantiation without some context on the " "instantiation stack"); // If T is not a dependent type or a variably-modified type, there // is nothing to do. if (!T->isInstantiationDependentType() && !T->isVariablyModifiedType()) return T; TemplateInstantiator Instantiator(*this, TemplateArgs, Loc, Entity); return Instantiator.TransformType(T); } static bool NeedsInstantiationAsFunctionType(TypeSourceInfo *T) { if (T->getType()->isInstantiationDependentType() || T->getType()->isVariablyModifiedType()) return true; TypeLoc TL = T->getTypeLoc().IgnoreParens(); if (!TL.getAs()) return false; FunctionProtoTypeLoc FP = TL.castAs(); for (ParmVarDecl *P : FP.getParams()) { // This must be synthesized from a typedef. if (!P) continue; // If there are any parameters, a new TypeSourceInfo that refers to the // instantiated parameters must be built. return true; } return false; } /// A form of SubstType intended specifically for instantiating the /// type of a FunctionDecl. 
Its purpose is solely to force the /// instantiation of default-argument expressions and to avoid /// instantiating an exception-specification. TypeSourceInfo *Sema::SubstFunctionDeclType(TypeSourceInfo *T, const MultiLevelTemplateArgumentList &Args, SourceLocation Loc, DeclarationName Entity, CXXRecordDecl *ThisContext, Qualifiers ThisTypeQuals, bool EvaluateConstraints) { assert(!CodeSynthesisContexts.empty() && "Cannot perform an instantiation without some context on the " "instantiation stack"); if (!NeedsInstantiationAsFunctionType(T)) return T; TemplateInstantiator Instantiator(*this, Args, Loc, Entity); Instantiator.setEvaluateConstraints(EvaluateConstraints); TypeLocBuilder TLB; TypeLoc TL = T->getTypeLoc(); TLB.reserve(TL.getFullDataSize()); QualType Result; if (FunctionProtoTypeLoc Proto = TL.IgnoreParens().getAs()) { // Instantiate the type, other than its exception specification. The // exception specification is instantiated in InitFunctionInstantiation // once we've built the FunctionDecl. // FIXME: Set the exception specification to EST_Uninstantiated here, // instead of rebuilding the function type again later. Result = Instantiator.TransformFunctionProtoType( TLB, Proto, ThisContext, ThisTypeQuals, [](FunctionProtoType::ExceptionSpecInfo &ESI, bool &Changed) { return false; }); } else { Result = Instantiator.TransformType(TLB, TL); } if (Result.isNull()) return nullptr; return TLB.getTypeSourceInfo(Context, Result); } bool Sema::SubstExceptionSpec(SourceLocation Loc, FunctionProtoType::ExceptionSpecInfo &ESI, SmallVectorImpl &ExceptionStorage, const MultiLevelTemplateArgumentList &Args) { assert(ESI.Type != EST_Uninstantiated); bool Changed = false; TemplateInstantiator Instantiator(*this, Args, Loc, DeclarationName()); return Instantiator.TransformExceptionSpec(Loc, ESI, ExceptionStorage, Changed); } void Sema::SubstExceptionSpec(FunctionDecl *New, const FunctionProtoType *Proto, const MultiLevelTemplateArgumentList &Args) { FunctionProtoType::ExceptionSpecInfo ESI = Proto->getExtProtoInfo().ExceptionSpec; SmallVector ExceptionStorage; if (SubstExceptionSpec(New->getTypeSourceInfo()->getTypeLoc().getEndLoc(), ESI, ExceptionStorage, Args)) // On error, recover by dropping the exception specification. ESI.Type = EST_None; UpdateExceptionSpec(New, ESI); } namespace { struct GetContainedInventedTypeParmVisitor : public TypeVisitor { using TypeVisitor::Visit; TemplateTypeParmDecl *Visit(QualType T) { if (T.isNull()) return nullptr; return Visit(T.getTypePtr()); } // The deduced type itself. TemplateTypeParmDecl *VisitTemplateTypeParmType( const TemplateTypeParmType *T) { if (!T->getDecl() || !T->getDecl()->isImplicit()) return nullptr; return T->getDecl(); } // Only these types can contain 'auto' types, and subsequently be replaced // by references to invented parameters. 
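  // For illustration, a minimal sketch of the kinds of types this visitor
  // walks through:
  //
  //   template <class T> concept Small = sizeof(T) <= sizeof(void *);
  //   void f(Small auto *p, Small auto &r);
  //
  // Each constrained 'auto' invents a template type parameter; the visitor
  // digs through pointer, reference, array, function, etc. types to find it
  // so that its TypeConstraint can be instantiated.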
TemplateTypeParmDecl *VisitElaboratedType(const ElaboratedType *T) { return Visit(T->getNamedType()); } TemplateTypeParmDecl *VisitPointerType(const PointerType *T) { return Visit(T->getPointeeType()); } TemplateTypeParmDecl *VisitBlockPointerType(const BlockPointerType *T) { return Visit(T->getPointeeType()); } TemplateTypeParmDecl *VisitReferenceType(const ReferenceType *T) { return Visit(T->getPointeeTypeAsWritten()); } TemplateTypeParmDecl *VisitMemberPointerType(const MemberPointerType *T) { return Visit(T->getPointeeType()); } TemplateTypeParmDecl *VisitArrayType(const ArrayType *T) { return Visit(T->getElementType()); } TemplateTypeParmDecl *VisitDependentSizedExtVectorType( const DependentSizedExtVectorType *T) { return Visit(T->getElementType()); } TemplateTypeParmDecl *VisitVectorType(const VectorType *T) { return Visit(T->getElementType()); } TemplateTypeParmDecl *VisitFunctionProtoType(const FunctionProtoType *T) { return VisitFunctionType(T); } TemplateTypeParmDecl *VisitFunctionType(const FunctionType *T) { return Visit(T->getReturnType()); } TemplateTypeParmDecl *VisitParenType(const ParenType *T) { return Visit(T->getInnerType()); } TemplateTypeParmDecl *VisitAttributedType(const AttributedType *T) { return Visit(T->getModifiedType()); } TemplateTypeParmDecl *VisitMacroQualifiedType(const MacroQualifiedType *T) { return Visit(T->getUnderlyingType()); } TemplateTypeParmDecl *VisitAdjustedType(const AdjustedType *T) { return Visit(T->getOriginalType()); } TemplateTypeParmDecl *VisitPackExpansionType(const PackExpansionType *T) { return Visit(T->getPattern()); } }; } // namespace bool Sema::SubstTypeConstraint( TemplateTypeParmDecl *Inst, const TypeConstraint *TC, const MultiLevelTemplateArgumentList &TemplateArgs, bool EvaluateConstraints) { const ASTTemplateArgumentListInfo *TemplArgInfo = TC->getTemplateArgsAsWritten(); if (!EvaluateConstraints) { Inst->setTypeConstraint(TC->getNestedNameSpecifierLoc(), TC->getConceptNameInfo(), TC->getNamedConcept(), TC->getNamedConcept(), TemplArgInfo, TC->getImmediatelyDeclaredConstraint()); return false; } TemplateArgumentListInfo InstArgs; if (TemplArgInfo) { InstArgs.setLAngleLoc(TemplArgInfo->LAngleLoc); InstArgs.setRAngleLoc(TemplArgInfo->RAngleLoc); if (SubstTemplateArguments(TemplArgInfo->arguments(), TemplateArgs, InstArgs)) return true; } return AttachTypeConstraint( TC->getNestedNameSpecifierLoc(), TC->getConceptNameInfo(), TC->getNamedConcept(), &InstArgs, Inst, Inst->isParameterPack() ? cast(TC->getImmediatelyDeclaredConstraint()) ->getEllipsisLoc() : SourceLocation()); } ParmVarDecl *Sema::SubstParmVarDecl( ParmVarDecl *OldParm, const MultiLevelTemplateArgumentList &TemplateArgs, int indexAdjustment, std::optional NumExpansions, bool ExpectParameterPack, bool EvaluateConstraint) { TypeSourceInfo *OldDI = OldParm->getTypeSourceInfo(); TypeSourceInfo *NewDI = nullptr; TypeLoc OldTL = OldDI->getTypeLoc(); if (PackExpansionTypeLoc ExpansionTL = OldTL.getAs()) { // We have a function parameter pack. Substitute into the pattern of the // expansion. NewDI = SubstType(ExpansionTL.getPatternLoc(), TemplateArgs, OldParm->getLocation(), OldParm->getDeclName()); if (!NewDI) return nullptr; if (NewDI->getType()->containsUnexpandedParameterPack()) { // We still have unexpanded parameter packs, which means that // our function parameter is still a function parameter pack. // Therefore, make its type a pack expansion type. 
NewDI = CheckPackExpansion(NewDI, ExpansionTL.getEllipsisLoc(), NumExpansions); } else if (ExpectParameterPack) { // We expected to get a parameter pack but didn't (because the type // itself is not a pack expansion type), so complain. This can occur when // the substitution goes through an alias template that "loses" the // pack expansion. Diag(OldParm->getLocation(), diag::err_function_parameter_pack_without_parameter_packs) << NewDI->getType(); return nullptr; } } else { NewDI = SubstType(OldDI, TemplateArgs, OldParm->getLocation(), OldParm->getDeclName()); } if (!NewDI) return nullptr; if (NewDI->getType()->isVoidType()) { Diag(OldParm->getLocation(), diag::err_param_with_void_type); return nullptr; } // In abbreviated templates, TemplateTypeParmDecls with possible // TypeConstraints are created when the parameter list is originally parsed. // The TypeConstraints can therefore reference other functions parameters in // the abbreviated function template, which is why we must instantiate them // here, when the instantiated versions of those referenced parameters are in // scope. if (TemplateTypeParmDecl *TTP = GetContainedInventedTypeParmVisitor().Visit(OldDI->getType())) { if (const TypeConstraint *TC = TTP->getTypeConstraint()) { auto *Inst = cast_or_null( FindInstantiatedDecl(TTP->getLocation(), TTP, TemplateArgs)); // We will first get here when instantiating the abbreviated function // template's described function, but we might also get here later. // Make sure we do not instantiate the TypeConstraint more than once. if (Inst && !Inst->getTypeConstraint()) { if (SubstTypeConstraint(Inst, TC, TemplateArgs, EvaluateConstraint)) return nullptr; } } } ParmVarDecl *NewParm = CheckParameter(Context.getTranslationUnitDecl(), OldParm->getInnerLocStart(), OldParm->getLocation(), OldParm->getIdentifier(), NewDI->getType(), NewDI, OldParm->getStorageClass()); if (!NewParm) return nullptr; // Mark the (new) default argument as uninstantiated (if any). if (OldParm->hasUninstantiatedDefaultArg()) { Expr *Arg = OldParm->getUninstantiatedDefaultArg(); NewParm->setUninstantiatedDefaultArg(Arg); } else if (OldParm->hasUnparsedDefaultArg()) { NewParm->setUnparsedDefaultArg(); UnparsedDefaultArgInstantiations[OldParm].push_back(NewParm); } else if (Expr *Arg = OldParm->getDefaultArg()) { // Default arguments cannot be substituted until the declaration context // for the associated function or lambda capture class is available. // This is necessary for cases like the following where construction of // the lambda capture class for the outer lambda is dependent on the // parameter types but where the default argument is dependent on the // outer lambda's declaration context. // template // auto f() { // return [](T = []{ return T{}; }()) { return 0; }; // } NewParm->setUninstantiatedDefaultArg(Arg); } NewParm->setHasInheritedDefaultArg(OldParm->hasInheritedDefaultArg()); if (OldParm->isParameterPack() && !NewParm->isParameterPack()) { // Add the new parameter to the instantiated parameter pack. CurrentInstantiationScope->InstantiatedLocalPackArg(OldParm, NewParm); } else { // Introduce an Old -> New mapping CurrentInstantiationScope->InstantiatedLocal(OldParm, NewParm); } // FIXME: OldParm may come from a FunctionProtoType, in which case CurContext // can be anything, is this right ? 
NewParm->setDeclContext(CurContext); NewParm->setScopeInfo(OldParm->getFunctionScopeDepth(), OldParm->getFunctionScopeIndex() + indexAdjustment); InstantiateAttrs(TemplateArgs, OldParm, NewParm); return NewParm; } /// Substitute the given template arguments into the given set of /// parameters, producing the set of parameter types that would be generated /// from such a substitution. bool Sema::SubstParmTypes( SourceLocation Loc, ArrayRef Params, const FunctionProtoType::ExtParameterInfo *ExtParamInfos, const MultiLevelTemplateArgumentList &TemplateArgs, SmallVectorImpl &ParamTypes, SmallVectorImpl *OutParams, ExtParameterInfoBuilder &ParamInfos) { assert(!CodeSynthesisContexts.empty() && "Cannot perform an instantiation without some context on the " "instantiation stack"); TemplateInstantiator Instantiator(*this, TemplateArgs, Loc, DeclarationName()); return Instantiator.TransformFunctionTypeParams( Loc, Params, nullptr, ExtParamInfos, ParamTypes, OutParams, ParamInfos); } /// Substitute the given template arguments into the default argument. bool Sema::SubstDefaultArgument( SourceLocation Loc, ParmVarDecl *Param, const MultiLevelTemplateArgumentList &TemplateArgs, bool ForCallExpr) { FunctionDecl *FD = cast(Param->getDeclContext()); Expr *PatternExpr = Param->getUninstantiatedDefaultArg(); EnterExpressionEvaluationContext EvalContext( *this, ExpressionEvaluationContext::PotentiallyEvaluated, Param); InstantiatingTemplate Inst(*this, Loc, Param, TemplateArgs.getInnermost()); if (Inst.isInvalid()) return true; if (Inst.isAlreadyInstantiating()) { Diag(Param->getBeginLoc(), diag::err_recursive_default_argument) << FD; Param->setInvalidDecl(); return true; } ExprResult Result; { // C++ [dcl.fct.default]p5: // The names in the [default argument] expression are bound, and // the semantic constraints are checked, at the point where the // default argument expression appears. ContextRAII SavedContext(*this, FD); std::unique_ptr LIS; if (ForCallExpr) { // When instantiating a default argument due to use in a call expression, // an instantiation scope that includes the parameters of the callee is // required to satisfy references from the default argument. For example: // template void f(T a, int = decltype(a)()); // void g() { f(0); } LIS = std::make_unique(*this); FunctionDecl *PatternFD = FD->getTemplateInstantiationPattern( /*ForDefinition*/ false); if (addInstantiatedParametersToScope(FD, PatternFD, *LIS, TemplateArgs)) return true; } runWithSufficientStackSpace(Loc, [&] { Result = SubstInitializer(PatternExpr, TemplateArgs, /*DirectInit*/false); }); } if (Result.isInvalid()) return true; if (ForCallExpr) { // Check the expression as an initializer for the parameter. InitializedEntity Entity = InitializedEntity::InitializeParameter(Context, Param); InitializationKind Kind = InitializationKind::CreateCopy( Param->getLocation(), /*FIXME:EqualLoc*/ PatternExpr->getBeginLoc()); Expr *ResultE = Result.getAs(); InitializationSequence InitSeq(*this, Entity, Kind, ResultE); Result = InitSeq.Perform(*this, Entity, Kind, ResultE); if (Result.isInvalid()) return true; Result = ActOnFinishFullExpr(Result.getAs(), Param->getOuterLocStart(), /*DiscardedValue*/ false); } else { // FIXME: Obtain the source location for the '=' token. SourceLocation EqualLoc = PatternExpr->getBeginLoc(); Result = ConvertParamDefaultArgument(Param, Result.getAs(), EqualLoc); } if (Result.isInvalid()) return true; // Remember the instantiated default argument. 
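  // For illustration, the call-site case handled by this function (restating
  // the example from the comment above, with its template header spelled
  // out):
  //
  //   template <class T> void f(T a, int b = decltype(a)());
  //   void g() { f(0); }  // the default argument for 'b' is substituted when
  //                       // this call is built, after the instantiated 'a'
  //                       // has been brought into scope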
Param->setDefaultArg(Result.getAs()); return false; } /// Perform substitution on the base class specifiers of the /// given class template specialization. /// /// Produces a diagnostic and returns true on error, returns false and /// attaches the instantiated base classes to the class template /// specialization if successful. bool Sema::SubstBaseSpecifiers(CXXRecordDecl *Instantiation, CXXRecordDecl *Pattern, const MultiLevelTemplateArgumentList &TemplateArgs) { bool Invalid = false; SmallVector InstantiatedBases; for (const auto &Base : Pattern->bases()) { if (!Base.getType()->isDependentType()) { if (const CXXRecordDecl *RD = Base.getType()->getAsCXXRecordDecl()) { if (RD->isInvalidDecl()) Instantiation->setInvalidDecl(); } InstantiatedBases.push_back(new (Context) CXXBaseSpecifier(Base)); continue; } SourceLocation EllipsisLoc; TypeSourceInfo *BaseTypeLoc; if (Base.isPackExpansion()) { // This is a pack expansion. See whether we should expand it now, or // wait until later. SmallVector Unexpanded; collectUnexpandedParameterPacks(Base.getTypeSourceInfo()->getTypeLoc(), Unexpanded); bool ShouldExpand = false; bool RetainExpansion = false; std::optional NumExpansions; if (CheckParameterPacksForExpansion(Base.getEllipsisLoc(), Base.getSourceRange(), Unexpanded, TemplateArgs, ShouldExpand, RetainExpansion, NumExpansions)) { Invalid = true; continue; } // If we should expand this pack expansion now, do so. if (ShouldExpand) { for (unsigned I = 0; I != *NumExpansions; ++I) { Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(*this, I); TypeSourceInfo *BaseTypeLoc = SubstType(Base.getTypeSourceInfo(), TemplateArgs, Base.getSourceRange().getBegin(), DeclarationName()); if (!BaseTypeLoc) { Invalid = true; continue; } if (CXXBaseSpecifier *InstantiatedBase = CheckBaseSpecifier(Instantiation, Base.getSourceRange(), Base.isVirtual(), Base.getAccessSpecifierAsWritten(), BaseTypeLoc, SourceLocation())) InstantiatedBases.push_back(InstantiatedBase); else Invalid = true; } continue; } // The resulting base specifier will (still) be a pack expansion. EllipsisLoc = Base.getEllipsisLoc(); Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(*this, -1); BaseTypeLoc = SubstType(Base.getTypeSourceInfo(), TemplateArgs, Base.getSourceRange().getBegin(), DeclarationName()); } else { BaseTypeLoc = SubstType(Base.getTypeSourceInfo(), TemplateArgs, Base.getSourceRange().getBegin(), DeclarationName()); } if (!BaseTypeLoc) { Invalid = true; continue; } if (CXXBaseSpecifier *InstantiatedBase = CheckBaseSpecifier(Instantiation, Base.getSourceRange(), Base.isVirtual(), Base.getAccessSpecifierAsWritten(), BaseTypeLoc, EllipsisLoc)) InstantiatedBases.push_back(InstantiatedBase); else Invalid = true; } if (!Invalid && AttachBaseSpecifiers(Instantiation, InstantiatedBases)) Invalid = true; return Invalid; } // Defined via #include from SemaTemplateInstantiateDecl.cpp namespace clang { namespace sema { Attr *instantiateTemplateAttribute(const Attr *At, ASTContext &C, Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs); Attr *instantiateTemplateAttributeForDecl( const Attr *At, ASTContext &C, Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs); } } /// Instantiate the definition of a class from a given pattern. /// /// \param PointOfInstantiation The point of instantiation within the /// source code. /// /// \param Instantiation is the declaration whose definition is being /// instantiated. This will be either a class template specialization /// or a member class of a class template specialization. 
/// /// \param Pattern is the pattern from which the instantiation /// occurs. This will be either the declaration of a class template or /// the declaration of a member class of a class template. /// /// \param TemplateArgs The template arguments to be substituted into /// the pattern. /// /// \param TSK the kind of implicit or explicit instantiation to perform. /// /// \param Complain whether to complain if the class cannot be instantiated due /// to the lack of a definition. /// /// \returns true if an error occurred, false otherwise. bool Sema::InstantiateClass(SourceLocation PointOfInstantiation, CXXRecordDecl *Instantiation, CXXRecordDecl *Pattern, const MultiLevelTemplateArgumentList &TemplateArgs, TemplateSpecializationKind TSK, bool Complain) { CXXRecordDecl *PatternDef = cast_or_null(Pattern->getDefinition()); if (DiagnoseUninstantiableTemplate(PointOfInstantiation, Instantiation, Instantiation->getInstantiatedFromMemberClass(), Pattern, PatternDef, TSK, Complain)) return true; llvm::TimeTraceScope TimeScope("InstantiateClass", [&]() { std::string Name; llvm::raw_string_ostream OS(Name); Instantiation->getNameForDiagnostic(OS, getPrintingPolicy(), /*Qualified=*/true); return Name; }); Pattern = PatternDef; // Record the point of instantiation. if (MemberSpecializationInfo *MSInfo = Instantiation->getMemberSpecializationInfo()) { MSInfo->setTemplateSpecializationKind(TSK); MSInfo->setPointOfInstantiation(PointOfInstantiation); } else if (ClassTemplateSpecializationDecl *Spec = dyn_cast(Instantiation)) { Spec->setTemplateSpecializationKind(TSK); Spec->setPointOfInstantiation(PointOfInstantiation); } InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation); if (Inst.isInvalid()) return true; assert(!Inst.isAlreadyInstantiating() && "should have been caught by caller"); PrettyDeclStackTraceEntry CrashInfo(Context, Instantiation, SourceLocation(), "instantiating class definition"); // Enter the scope of this instantiation. We don't use // PushDeclContext because we don't have a scope. ContextRAII SavedContext(*this, Instantiation); EnterExpressionEvaluationContext EvalContext( *this, Sema::ExpressionEvaluationContext::PotentiallyEvaluated); // If this is an instantiation of a local class, merge this local // instantiation scope with the enclosing scope. Otherwise, every // instantiation of a class has its own local instantiation scope. bool MergeWithParentScope = !Instantiation->isDefinedOutsideFunctionOrMethod(); LocalInstantiationScope Scope(*this, MergeWithParentScope); // Some class state isn't processed immediately but delayed till class // instantiation completes. We may not be ready to handle any delayed state // already on the stack as it might correspond to a different class, so save // it now and put it back later. SavePendingParsedClassStateRAII SavedPendingParsedClassState(*this); // Pull attributes from the pattern onto the instantiation. InstantiateAttrs(TemplateArgs, Pattern, Instantiation); // Start the definition of this instantiation. Instantiation->startDefinition(); // The instantiation is visible here, even if it was first declared in an // unimported module. Instantiation->setVisibleDespiteOwningModule(); // FIXME: This loses the as-written tag kind for an explicit instantiation. Instantiation->setTagKind(Pattern->getTagKind()); // Do substitution on the base class specifiers. 
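  // (Illustrative sketch, not taken from this patch: the kind of dependent and
  // pack-expansion base specifiers the call below has to substitute. All names
  // are hypothetical.)
  // @code
  //   template <typename T, typename... Mixins>
  //   struct Widget : T::base_type, Mixins... {};
  //
  //   struct Base { using base_type = Base; };
  //   struct A {}; struct B {};
  //   Widget<Base, A, B> w;  // bases after substitution: Base, A, B
  // @endcode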
if (SubstBaseSpecifiers(Instantiation, Pattern, TemplateArgs)) Instantiation->setInvalidDecl(); TemplateDeclInstantiator Instantiator(*this, Instantiation, TemplateArgs); Instantiator.setEvaluateConstraints(false); SmallVector Fields; // Delay instantiation of late parsed attributes. LateInstantiatedAttrVec LateAttrs; Instantiator.enableLateAttributeInstantiation(&LateAttrs); bool MightHaveConstexprVirtualFunctions = false; for (auto *Member : Pattern->decls()) { // Don't instantiate members not belonging in this semantic context. // e.g. for: // @code // template class A { // class B *g; // }; // @endcode // 'class B' has the template as lexical context but semantically it is // introduced in namespace scope. if (Member->getDeclContext() != Pattern) continue; // BlockDecls can appear in a default-member-initializer. They must be the // child of a BlockExpr, so we only know how to instantiate them from there. // Similarly, lambda closure types are recreated when instantiating the // corresponding LambdaExpr. if (isa(Member) || (isa(Member) && cast(Member)->isLambda())) continue; if (Member->isInvalidDecl()) { Instantiation->setInvalidDecl(); continue; } Decl *NewMember = Instantiator.Visit(Member); if (NewMember) { if (FieldDecl *Field = dyn_cast(NewMember)) { Fields.push_back(Field); } else if (EnumDecl *Enum = dyn_cast(NewMember)) { // C++11 [temp.inst]p1: The implicit instantiation of a class template // specialization causes the implicit instantiation of the definitions // of unscoped member enumerations. // Record a point of instantiation for this implicit instantiation. if (TSK == TSK_ImplicitInstantiation && !Enum->isScoped() && Enum->isCompleteDefinition()) { MemberSpecializationInfo *MSInfo =Enum->getMemberSpecializationInfo(); assert(MSInfo && "no spec info for member enum specialization"); MSInfo->setTemplateSpecializationKind(TSK_ImplicitInstantiation); MSInfo->setPointOfInstantiation(PointOfInstantiation); } } else if (StaticAssertDecl *SA = dyn_cast(NewMember)) { if (SA->isFailed()) { // A static_assert failed. Bail out; instantiating this // class is probably not meaningful. Instantiation->setInvalidDecl(); break; } } else if (CXXMethodDecl *MD = dyn_cast(NewMember)) { if (MD->isConstexpr() && !MD->getFriendObjectKind() && (MD->isVirtualAsWritten() || Instantiation->getNumBases())) MightHaveConstexprVirtualFunctions = true; } if (NewMember->isInvalidDecl()) Instantiation->setInvalidDecl(); } else { // FIXME: Eventually, a NULL return will mean that one of the // instantiations was a semantic disaster, and we'll want to mark the // declaration invalid. // For now, we expect to skip some members that we can't yet handle. } } // Finish checking fields. ActOnFields(nullptr, Instantiation->getLocation(), Instantiation, Fields, SourceLocation(), SourceLocation(), ParsedAttributesView()); CheckCompletedCXXClass(nullptr, Instantiation); // Default arguments are parsed, if not instantiated. We can go instantiate // default arg exprs for default constructors if necessary now. Unless we're // parsing a class, in which case wait until that's finished. if (ParsingClassDepth == 0) ActOnFinishCXXNonNestedClass(); // Instantiate late parsed attributes, and attach them to their decls. // See Sema::InstantiateAttrs for (LateInstantiatedAttrVec::iterator I = LateAttrs.begin(), E = LateAttrs.end(); I != E; ++I) { assert(CurrentInstantiationScope == Instantiator.getStartingScope()); CurrentInstantiationScope = I->Scope; // Allow 'this' within late-parsed attributes. 
auto *ND = cast(I->NewDecl); auto *ThisContext = dyn_cast_or_null(ND->getDeclContext()); CXXThisScopeRAII ThisScope(*this, ThisContext, Qualifiers(), ND->isCXXInstanceMember()); Attr *NewAttr = instantiateTemplateAttribute(I->TmplAttr, Context, *this, TemplateArgs); if (NewAttr) I->NewDecl->addAttr(NewAttr); LocalInstantiationScope::deleteScopes(I->Scope, Instantiator.getStartingScope()); } Instantiator.disableLateAttributeInstantiation(); LateAttrs.clear(); ActOnFinishDelayedMemberInitializers(Instantiation); // FIXME: We should do something similar for explicit instantiations so they // end up in the right module. if (TSK == TSK_ImplicitInstantiation) { Instantiation->setLocation(Pattern->getLocation()); Instantiation->setLocStart(Pattern->getInnerLocStart()); Instantiation->setBraceRange(Pattern->getBraceRange()); } if (!Instantiation->isInvalidDecl()) { // Perform any dependent diagnostics from the pattern. if (Pattern->isDependentContext()) PerformDependentDiagnostics(Pattern, TemplateArgs); // Instantiate any out-of-line class template partial // specializations now. for (TemplateDeclInstantiator::delayed_partial_spec_iterator P = Instantiator.delayed_partial_spec_begin(), PEnd = Instantiator.delayed_partial_spec_end(); P != PEnd; ++P) { if (!Instantiator.InstantiateClassTemplatePartialSpecialization( P->first, P->second)) { Instantiation->setInvalidDecl(); break; } } // Instantiate any out-of-line variable template partial // specializations now. for (TemplateDeclInstantiator::delayed_var_partial_spec_iterator P = Instantiator.delayed_var_partial_spec_begin(), PEnd = Instantiator.delayed_var_partial_spec_end(); P != PEnd; ++P) { if (!Instantiator.InstantiateVarTemplatePartialSpecialization( P->first, P->second)) { Instantiation->setInvalidDecl(); break; } } } // Exit the scope of this instantiation. SavedContext.pop(); if (!Instantiation->isInvalidDecl()) { // Always emit the vtable for an explicit instantiation definition // of a polymorphic class template specialization. Otherwise, eagerly // instantiate only constexpr virtual functions in preparation for their use // in constant evaluation. if (TSK == TSK_ExplicitInstantiationDefinition) MarkVTableUsed(PointOfInstantiation, Instantiation, true); else if (MightHaveConstexprVirtualFunctions) MarkVirtualMembersReferenced(PointOfInstantiation, Instantiation, /*ConstexprOnly*/ true); } Consumer.HandleTagDeclDefinition(Instantiation); return Instantiation->isInvalidDecl(); } /// Instantiate the definition of an enum from a given pattern. /// /// \param PointOfInstantiation The point of instantiation within the /// source code. /// \param Instantiation is the declaration whose definition is being /// instantiated. This will be a member enumeration of a class /// temploid specialization, or a local enumeration within a /// function temploid specialization. /// \param Pattern The templated declaration from which the instantiation /// occurs. /// \param TemplateArgs The template arguments to be substituted into /// the pattern. /// \param TSK The kind of implicit or explicit instantiation to perform. /// /// \return \c true if an error occurred, \c false otherwise. 
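///
/// For example (an illustrative sketch, not taken from this patch; the names
/// are hypothetical), naming Outer<int>::E::Size below requires this
/// instantiation for the scoped member enumeration:
/// @code
///   template <typename T> struct Outer {
///     enum class E : unsigned { Size = sizeof(T) };
///   };
///   auto N = Outer<int>::E::Size;  // instantiates the definition of E
/// @endcode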
bool Sema::InstantiateEnum(SourceLocation PointOfInstantiation,
                           EnumDecl *Instantiation, EnumDecl *Pattern,
                           const MultiLevelTemplateArgumentList &TemplateArgs,
                           TemplateSpecializationKind TSK) {
  EnumDecl *PatternDef = Pattern->getDefinition();
  if (DiagnoseUninstantiableTemplate(
          PointOfInstantiation, Instantiation,
          Instantiation->getInstantiatedFromMemberEnum(), Pattern, PatternDef,
          TSK, /*Complain*/ true))
    return true;
  Pattern = PatternDef;

  // Record the point of instantiation.
  if (MemberSpecializationInfo *MSInfo =
          Instantiation->getMemberSpecializationInfo()) {
    MSInfo->setTemplateSpecializationKind(TSK);
    MSInfo->setPointOfInstantiation(PointOfInstantiation);
  }

  InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation);
  if (Inst.isInvalid())
    return true;
  if (Inst.isAlreadyInstantiating())
    return false;
  PrettyDeclStackTraceEntry CrashInfo(Context, Instantiation, SourceLocation(),
                                      "instantiating enum definition");

  // The instantiation is visible here, even if it was first declared in an
  // unimported module.
  Instantiation->setVisibleDespiteOwningModule();

  // Enter the scope of this instantiation. We don't use
  // PushDeclContext because we don't have a scope.
  ContextRAII SavedContext(*this, Instantiation);
  EnterExpressionEvaluationContext EvalContext(
      *this, Sema::ExpressionEvaluationContext::PotentiallyEvaluated);

  LocalInstantiationScope Scope(*this, /*MergeWithParentScope*/ true);

  // Pull attributes from the pattern onto the instantiation.
  InstantiateAttrs(TemplateArgs, Pattern, Instantiation);

  TemplateDeclInstantiator Instantiator(*this, Instantiation, TemplateArgs);
  Instantiator.InstantiateEnumDefinition(Instantiation, Pattern);

  // Exit the scope of this instantiation.
  SavedContext.pop();

  return Instantiation->isInvalidDecl();
}

/// Instantiate the definition of a field from the given pattern.
///
/// \param PointOfInstantiation The point of instantiation within the
/// source code.
/// \param Instantiation is the declaration whose definition is being
/// instantiated. This will be a class of a class temploid
/// specialization, or a local enumeration within a function temploid
/// specialization.
/// \param Pattern The templated declaration from which the instantiation
/// occurs.
/// \param TemplateArgs The template arguments to be substituted into
/// the pattern.
///
/// \return \c true if an error occurred, \c false otherwise.
bool Sema::InstantiateInClassInitializer(
    SourceLocation PointOfInstantiation, FieldDecl *Instantiation,
    FieldDecl *Pattern, const MultiLevelTemplateArgumentList &TemplateArgs) {
  // If there is no initializer, we don't need to do anything.
  if (!Pattern->hasInClassInitializer())
    return false;

  assert(Instantiation->getInClassInitStyle() ==
             Pattern->getInClassInitStyle() &&
         "pattern and instantiation disagree about init style");

  // Error out if we haven't parsed the initializer of the pattern yet because
  // we are waiting for the closing brace of the outer class.
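  // (Illustrative sketch, not taken from this patch; names are hypothetical.
  // In the common case the pattern's initializer has already been parsed and
  // is instantiated here, e.g. when Widget<int>'s implicit default
  // constructor is needed:)
  // @code
  //   template <typename T> struct Widget {
  //     T value = T{};   // in-class initializer serving as the pattern
  //   };
  //   Widget<int> w;     // uses Widget<int>::Widget(), which needs the init
  // @endcode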
  Expr *OldInit = Pattern->getInClassInitializer();
  if (!OldInit) {
    RecordDecl *PatternRD = Pattern->getParent();
    RecordDecl *OutermostClass = PatternRD->getOuterLexicalRecordContext();
    Diag(PointOfInstantiation,
         diag::err_default_member_initializer_not_yet_parsed)
        << OutermostClass << Pattern;
    Diag(Pattern->getEndLoc(),
         diag::note_default_member_initializer_not_yet_parsed);
    Instantiation->setInvalidDecl();
    return true;
  }

  InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation);
  if (Inst.isInvalid())
    return true;
  if (Inst.isAlreadyInstantiating()) {
    // Error out if we hit an instantiation cycle for this initializer.
    Diag(PointOfInstantiation, diag::err_default_member_initializer_cycle)
        << Instantiation;
    return true;
  }
  PrettyDeclStackTraceEntry CrashInfo(Context, Instantiation, SourceLocation(),
                                      "instantiating default member init");

  // Enter the scope of this instantiation. We don't use PushDeclContext because
  // we don't have a scope.
  ContextRAII SavedContext(*this, Instantiation->getParent());
  EnterExpressionEvaluationContext EvalContext(
      *this, Sema::ExpressionEvaluationContext::PotentiallyEvaluated);
  ExprEvalContexts.back().DelayedDefaultInitializationContext = {
      PointOfInstantiation, Instantiation, CurContext};

  LocalInstantiationScope Scope(*this, true);

  // Instantiate the initializer.
  ActOnStartCXXInClassMemberInitializer();
  CXXThisScopeRAII ThisScope(*this, Instantiation->getParent(), Qualifiers());

  ExprResult NewInit = SubstInitializer(OldInit, TemplateArgs,
                                        /*CXXDirectInit=*/false);
  Expr *Init = NewInit.get();
  assert((!Init || !isa<ParenListExpr>(Init)) && "call-style init in class");
  ActOnFinishCXXInClassMemberInitializer(
      Instantiation, Init ? Init->getBeginLoc() : SourceLocation(), Init);

  if (auto *L = getASTMutationListener())
    L->DefaultMemberInitializerInstantiated(Instantiation);

  // Return true if the in-class initializer is still missing.
  return !Instantiation->getInClassInitializer();
}

namespace {
  /// A partial specialization whose template arguments have matched
  /// a given template-id.
  struct PartialSpecMatchResult {
    ClassTemplatePartialSpecializationDecl *Partial;
    TemplateArgumentList *Args;
  };
}

bool Sema::usesPartialOrExplicitSpecialization(
    SourceLocation Loc, ClassTemplateSpecializationDecl *ClassTemplateSpec) {
  if (ClassTemplateSpec->getTemplateSpecializationKind() ==
      TSK_ExplicitSpecialization)
    return true;

  SmallVector<ClassTemplatePartialSpecializationDecl *, 4> PartialSpecs;
  ClassTemplateSpec->getSpecializedTemplate()
                   ->getPartialSpecializations(PartialSpecs);
  for (unsigned I = 0, N = PartialSpecs.size(); I != N; ++I) {
    TemplateDeductionInfo Info(Loc);
    if (!DeduceTemplateArguments(PartialSpecs[I],
                                 ClassTemplateSpec->getTemplateArgs(), Info))
      return true;
  }

  return false;
}

/// Get the instantiation pattern to use to instantiate the definition of a
/// given ClassTemplateSpecializationDecl (either the pattern of the primary
/// template or of a partial specialization).
static ActionResult<CXXRecordDecl *>
getPatternForClassTemplateSpecialization(
    Sema &S, SourceLocation PointOfInstantiation,
    ClassTemplateSpecializationDecl *ClassTemplateSpec,
    TemplateSpecializationKind TSK) {
  Sema::InstantiatingTemplate Inst(S, PointOfInstantiation, ClassTemplateSpec);
  if (Inst.isInvalid())
    return {/*Invalid=*/true};
  if (Inst.isAlreadyInstantiating())
    return {/*Invalid=*/false};

  llvm::PointerUnion<ClassTemplateDecl *,
                     ClassTemplatePartialSpecializationDecl *>
      Specialized = ClassTemplateSpec->getSpecializedTemplateOrPartial();
  if (!Specialized.is<ClassTemplatePartialSpecializationDecl *>()) {
    // Find best matching specialization.
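    // (Illustrative sketch, not taken from this patch; names are
    // hypothetical. This is the choice being made here:)
    // @code
    //   template <typename T> struct S {};       // primary template
    //   template <typename T> struct S<T *> {};  // partial specialization
    //   S<int *> s;  // definition is generated from the S<T*> partial spec
    // @endcode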
ClassTemplateDecl *Template = ClassTemplateSpec->getSpecializedTemplate(); // C++ [temp.class.spec.match]p1: // When a class template is used in a context that requires an // instantiation of the class, it is necessary to determine // whether the instantiation is to be generated using the primary // template or one of the partial specializations. This is done by // matching the template arguments of the class template // specialization with the template argument lists of the partial // specializations. typedef PartialSpecMatchResult MatchResult; SmallVector Matched; SmallVector PartialSpecs; Template->getPartialSpecializations(PartialSpecs); TemplateSpecCandidateSet FailedCandidates(PointOfInstantiation); for (unsigned I = 0, N = PartialSpecs.size(); I != N; ++I) { ClassTemplatePartialSpecializationDecl *Partial = PartialSpecs[I]; TemplateDeductionInfo Info(FailedCandidates.getLocation()); if (Sema::TemplateDeductionResult Result = S.DeduceTemplateArguments( Partial, ClassTemplateSpec->getTemplateArgs(), Info)) { // Store the failed-deduction information for use in diagnostics, later. // TODO: Actually use the failed-deduction info? FailedCandidates.addCandidate().set( DeclAccessPair::make(Template, AS_public), Partial, MakeDeductionFailureInfo(S.Context, Result, Info)); (void)Result; } else { Matched.push_back(PartialSpecMatchResult()); Matched.back().Partial = Partial; Matched.back().Args = Info.takeCanonical(); } } // If we're dealing with a member template where the template parameters // have been instantiated, this provides the original template parameters // from which the member template's parameters were instantiated. if (Matched.size() >= 1) { SmallVectorImpl::iterator Best = Matched.begin(); if (Matched.size() == 1) { // -- If exactly one matching specialization is found, the // instantiation is generated from that specialization. // We don't need to do anything for this. } else { // -- If more than one matching specialization is found, the // partial order rules (14.5.4.2) are used to determine // whether one of the specializations is more specialized // than the others. If none of the specializations is more // specialized than all of the other matching // specializations, then the use of the class template is // ambiguous and the program is ill-formed. for (SmallVectorImpl::iterator P = Best + 1, PEnd = Matched.end(); P != PEnd; ++P) { if (S.getMoreSpecializedPartialSpecialization( P->Partial, Best->Partial, PointOfInstantiation) == P->Partial) Best = P; } // Determine if the best partial specialization is more specialized than // the others. bool Ambiguous = false; for (SmallVectorImpl::iterator P = Matched.begin(), PEnd = Matched.end(); P != PEnd; ++P) { if (P != Best && S.getMoreSpecializedPartialSpecialization( P->Partial, Best->Partial, PointOfInstantiation) != Best->Partial) { Ambiguous = true; break; } } if (Ambiguous) { // Partial ordering did not produce a clear winner. Complain. Inst.Clear(); ClassTemplateSpec->setInvalidDecl(); S.Diag(PointOfInstantiation, diag::err_partial_spec_ordering_ambiguous) << ClassTemplateSpec; // Print the matching partial specializations. 
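      // (Illustrative sketch, not taken from this patch; names are
      // hypothetical. An ambiguity that reaches this diagnostic:)
      // @code
      //   template <typename T, typename U> struct P {};
      //   template <typename T, typename U> struct P<T *, U> {};
      //   template <typename T, typename U> struct P<T, U *> {};
      //   P<int *, int *> p;  // both partial specializations match and
      //                       // neither is more specialized: ill-formed
      // @endcode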
for (SmallVectorImpl::iterator P = Matched.begin(), PEnd = Matched.end(); P != PEnd; ++P) S.Diag(P->Partial->getLocation(), diag::note_partial_spec_match) << S.getTemplateArgumentBindingsText( P->Partial->getTemplateParameters(), *P->Args); return {/*Invalid=*/true}; } } ClassTemplateSpec->setInstantiationOf(Best->Partial, Best->Args); } else { // -- If no matches are found, the instantiation is generated // from the primary template. } } CXXRecordDecl *Pattern = nullptr; Specialized = ClassTemplateSpec->getSpecializedTemplateOrPartial(); if (auto *PartialSpec = Specialized.dyn_cast()) { // Instantiate using the best class template partial specialization. while (PartialSpec->getInstantiatedFromMember()) { // If we've found an explicit specialization of this class template, // stop here and use that as the pattern. if (PartialSpec->isMemberSpecialization()) break; PartialSpec = PartialSpec->getInstantiatedFromMember(); } Pattern = PartialSpec; } else { ClassTemplateDecl *Template = ClassTemplateSpec->getSpecializedTemplate(); while (Template->getInstantiatedFromMemberTemplate()) { // If we've found an explicit specialization of this class template, // stop here and use that as the pattern. if (Template->isMemberSpecialization()) break; Template = Template->getInstantiatedFromMemberTemplate(); } Pattern = Template->getTemplatedDecl(); } return Pattern; } bool Sema::InstantiateClassTemplateSpecialization( SourceLocation PointOfInstantiation, ClassTemplateSpecializationDecl *ClassTemplateSpec, TemplateSpecializationKind TSK, bool Complain) { // Perform the actual instantiation on the canonical declaration. ClassTemplateSpec = cast( ClassTemplateSpec->getCanonicalDecl()); if (ClassTemplateSpec->isInvalidDecl()) return true; ActionResult Pattern = getPatternForClassTemplateSpecialization(*this, PointOfInstantiation, ClassTemplateSpec, TSK); if (!Pattern.isUsable()) return Pattern.isInvalid(); return InstantiateClass( PointOfInstantiation, ClassTemplateSpec, Pattern.get(), getTemplateInstantiationArgs(ClassTemplateSpec), TSK, Complain); } /// Instantiates the definitions of all of the member /// of the given class, which is an instantiation of a class template /// or a member class of a template. void Sema::InstantiateClassMembers(SourceLocation PointOfInstantiation, CXXRecordDecl *Instantiation, const MultiLevelTemplateArgumentList &TemplateArgs, TemplateSpecializationKind TSK) { // FIXME: We need to notify the ASTMutationListener that we did all of these // things, in case we have an explicit instantiation definition in a PCM, a // module, or preamble, and the declaration is in an imported AST. 
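  // (Illustrative sketch, not taken from this patch; names are hypothetical.
  // An explicit instantiation definition that sends every member of the
  // specialization through the loop below:)
  // @code
  //   template <typename T> struct Holder {
  //     T get() const { return value; }
  //     T value;
  //   };
  //   template struct Holder<int>;  // instantiates the definition of get()
  // @endcode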
assert( (TSK == TSK_ExplicitInstantiationDefinition || TSK == TSK_ExplicitInstantiationDeclaration || (TSK == TSK_ImplicitInstantiation && Instantiation->isLocalClass())) && "Unexpected template specialization kind!"); for (auto *D : Instantiation->decls()) { bool SuppressNew = false; if (auto *Function = dyn_cast(D)) { if (FunctionDecl *Pattern = Function->getInstantiatedFromMemberFunction()) { if (Function->isIneligibleOrNotSelected()) continue; if (Function->getTrailingRequiresClause()) { ConstraintSatisfaction Satisfaction; if (CheckFunctionConstraints(Function, Satisfaction) || !Satisfaction.IsSatisfied) { continue; } } if (Function->hasAttr()) continue; MemberSpecializationInfo *MSInfo = Function->getMemberSpecializationInfo(); assert(MSInfo && "No member specialization information?"); if (MSInfo->getTemplateSpecializationKind() == TSK_ExplicitSpecialization) continue; if (CheckSpecializationInstantiationRedecl(PointOfInstantiation, TSK, Function, MSInfo->getTemplateSpecializationKind(), MSInfo->getPointOfInstantiation(), SuppressNew) || SuppressNew) continue; // C++11 [temp.explicit]p8: // An explicit instantiation definition that names a class template // specialization explicitly instantiates the class template // specialization and is only an explicit instantiation definition // of members whose definition is visible at the point of // instantiation. if (TSK == TSK_ExplicitInstantiationDefinition && !Pattern->isDefined()) continue; Function->setTemplateSpecializationKind(TSK, PointOfInstantiation); if (Function->isDefined()) { // Let the ASTConsumer know that this function has been explicitly // instantiated now, and its linkage might have changed. Consumer.HandleTopLevelDecl(DeclGroupRef(Function)); } else if (TSK == TSK_ExplicitInstantiationDefinition) { InstantiateFunctionDefinition(PointOfInstantiation, Function); } else if (TSK == TSK_ImplicitInstantiation) { PendingLocalImplicitInstantiations.push_back( std::make_pair(Function, PointOfInstantiation)); } } } else if (auto *Var = dyn_cast(D)) { if (isa(Var)) continue; if (Var->isStaticDataMember()) { if (Var->hasAttr()) continue; MemberSpecializationInfo *MSInfo = Var->getMemberSpecializationInfo(); assert(MSInfo && "No member specialization information?"); if (MSInfo->getTemplateSpecializationKind() == TSK_ExplicitSpecialization) continue; if (CheckSpecializationInstantiationRedecl(PointOfInstantiation, TSK, Var, MSInfo->getTemplateSpecializationKind(), MSInfo->getPointOfInstantiation(), SuppressNew) || SuppressNew) continue; if (TSK == TSK_ExplicitInstantiationDefinition) { // C++0x [temp.explicit]p8: // An explicit instantiation definition that names a class template // specialization explicitly instantiates the class template // specialization and is only an explicit instantiation definition // of members whose definition is visible at the point of // instantiation. if (!Var->getInstantiatedFromStaticDataMember()->getDefinition()) continue; Var->setTemplateSpecializationKind(TSK, PointOfInstantiation); InstantiateVariableDefinition(PointOfInstantiation, Var); } else { Var->setTemplateSpecializationKind(TSK, PointOfInstantiation); } } } else if (auto *Record = dyn_cast(D)) { if (Record->hasAttr()) continue; // Always skip the injected-class-name, along with any // redeclarations of nested classes, since both would cause us // to try to instantiate the members of a class twice. // Skip closure types; they'll get instantiated when we instantiate // the corresponding lambda-expression. 
if (Record->isInjectedClassName() || Record->getPreviousDecl() || Record->isLambda()) continue; MemberSpecializationInfo *MSInfo = Record->getMemberSpecializationInfo(); assert(MSInfo && "No member specialization information?"); if (MSInfo->getTemplateSpecializationKind() == TSK_ExplicitSpecialization) continue; if (Context.getTargetInfo().getTriple().isOSWindows() && TSK == TSK_ExplicitInstantiationDeclaration) { // On Windows, explicit instantiation decl of the outer class doesn't // affect the inner class. Typically extern template declarations are // used in combination with dll import/export annotations, but those // are not propagated from the outer class templates to inner classes. // Therefore, do not instantiate inner classes on this platform, so // that users don't end up with undefined symbols during linking. continue; } if (CheckSpecializationInstantiationRedecl(PointOfInstantiation, TSK, Record, MSInfo->getTemplateSpecializationKind(), MSInfo->getPointOfInstantiation(), SuppressNew) || SuppressNew) continue; CXXRecordDecl *Pattern = Record->getInstantiatedFromMemberClass(); assert(Pattern && "Missing instantiated-from-template information"); if (!Record->getDefinition()) { if (!Pattern->getDefinition()) { // C++0x [temp.explicit]p8: // An explicit instantiation definition that names a class template // specialization explicitly instantiates the class template // specialization and is only an explicit instantiation definition // of members whose definition is visible at the point of // instantiation. if (TSK == TSK_ExplicitInstantiationDeclaration) { MSInfo->setTemplateSpecializationKind(TSK); MSInfo->setPointOfInstantiation(PointOfInstantiation); } continue; } InstantiateClass(PointOfInstantiation, Record, Pattern, TemplateArgs, TSK); } else { if (TSK == TSK_ExplicitInstantiationDefinition && Record->getTemplateSpecializationKind() == TSK_ExplicitInstantiationDeclaration) { Record->setTemplateSpecializationKind(TSK); MarkVTableUsed(PointOfInstantiation, Record, true); } } Pattern = cast_or_null(Record->getDefinition()); if (Pattern) InstantiateClassMembers(PointOfInstantiation, Pattern, TemplateArgs, TSK); } else if (auto *Enum = dyn_cast(D)) { MemberSpecializationInfo *MSInfo = Enum->getMemberSpecializationInfo(); assert(MSInfo && "No member specialization information?"); if (MSInfo->getTemplateSpecializationKind() == TSK_ExplicitSpecialization) continue; if (CheckSpecializationInstantiationRedecl( PointOfInstantiation, TSK, Enum, MSInfo->getTemplateSpecializationKind(), MSInfo->getPointOfInstantiation(), SuppressNew) || SuppressNew) continue; if (Enum->getDefinition()) continue; EnumDecl *Pattern = Enum->getTemplateInstantiationPattern(); assert(Pattern && "Missing instantiated-from-template information"); if (TSK == TSK_ExplicitInstantiationDefinition) { if (!Pattern->getDefinition()) continue; InstantiateEnum(PointOfInstantiation, Enum, Pattern, TemplateArgs, TSK); } else { MSInfo->setTemplateSpecializationKind(TSK); MSInfo->setPointOfInstantiation(PointOfInstantiation); } } else if (auto *Field = dyn_cast(D)) { // No need to instantiate in-class initializers during explicit // instantiation. 
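      // (Illustrative sketch, not taken from this patch; names are
      // hypothetical. The implicit-instantiation case handled below arises
      // for local classes in function templates:)
      // @code
      //   template <typename T> int run() {
      //     struct Local { unsigned size = sizeof(T); };  // initializer
      //     Local l;                                      // instantiated
      //     return l.size;                                // with run<T>
      //   }
      //   int n = run<double>();
      // @endcode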
if (Field->hasInClassInitializer() && TSK == TSK_ImplicitInstantiation) { CXXRecordDecl *ClassPattern = Instantiation->getTemplateInstantiationPattern(); DeclContext::lookup_result Lookup = ClassPattern->lookup(Field->getDeclName()); FieldDecl *Pattern = Lookup.find_first(); assert(Pattern); InstantiateInClassInitializer(PointOfInstantiation, Field, Pattern, TemplateArgs); } } } } /// Instantiate the definitions of all of the members of the /// given class template specialization, which was named as part of an /// explicit instantiation. void Sema::InstantiateClassTemplateSpecializationMembers( SourceLocation PointOfInstantiation, ClassTemplateSpecializationDecl *ClassTemplateSpec, TemplateSpecializationKind TSK) { // C++0x [temp.explicit]p7: // An explicit instantiation that names a class template // specialization is an explicit instantion of the same kind // (declaration or definition) of each of its members (not // including members inherited from base classes) that has not // been previously explicitly specialized in the translation unit // containing the explicit instantiation, except as described // below. InstantiateClassMembers(PointOfInstantiation, ClassTemplateSpec, getTemplateInstantiationArgs(ClassTemplateSpec), TSK); } StmtResult Sema::SubstStmt(Stmt *S, const MultiLevelTemplateArgumentList &TemplateArgs) { if (!S) return S; TemplateInstantiator Instantiator(*this, TemplateArgs, SourceLocation(), DeclarationName()); return Instantiator.TransformStmt(S); } bool Sema::SubstTemplateArguments( ArrayRef Args, const MultiLevelTemplateArgumentList &TemplateArgs, TemplateArgumentListInfo &Out) { TemplateInstantiator Instantiator(*this, TemplateArgs, SourceLocation(), DeclarationName()); return Instantiator.TransformTemplateArguments(Args.begin(), Args.end(), Out); } ExprResult Sema::SubstExpr(Expr *E, const MultiLevelTemplateArgumentList &TemplateArgs) { if (!E) return E; TemplateInstantiator Instantiator(*this, TemplateArgs, SourceLocation(), DeclarationName()); return Instantiator.TransformExpr(E); } ExprResult Sema::SubstConstraintExpr(Expr *E, const MultiLevelTemplateArgumentList &TemplateArgs) { if (!E) return E; // This is where we need to make sure we 'know' constraint checking needs to // happen. TemplateInstantiator Instantiator(*this, TemplateArgs, SourceLocation(), DeclarationName()); return Instantiator.TransformExpr(E); } ExprResult Sema::SubstInitializer(Expr *Init, const MultiLevelTemplateArgumentList &TemplateArgs, bool CXXDirectInit) { TemplateInstantiator Instantiator(*this, TemplateArgs, SourceLocation(), DeclarationName()); return Instantiator.TransformInitializer(Init, CXXDirectInit); } bool Sema::SubstExprs(ArrayRef Exprs, bool IsCall, const MultiLevelTemplateArgumentList &TemplateArgs, SmallVectorImpl &Outputs) { if (Exprs.empty()) return false; TemplateInstantiator Instantiator(*this, TemplateArgs, SourceLocation(), DeclarationName()); return Instantiator.TransformExprs(Exprs.data(), Exprs.size(), IsCall, Outputs); } NestedNameSpecifierLoc Sema::SubstNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS, const MultiLevelTemplateArgumentList &TemplateArgs) { if (!NNS) return NestedNameSpecifierLoc(); TemplateInstantiator Instantiator(*this, TemplateArgs, NNS.getBeginLoc(), DeclarationName()); return Instantiator.TransformNestedNameSpecifierLoc(NNS); } /// Do template substitution on declaration name info. 
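///
/// For example (an illustrative sketch, not taken from this patch; names are
/// hypothetical), the name of a conversion function can itself be dependent
/// and must be substituted along with the rest of the declaration:
/// @code
///   template <typename T> struct Box {
///     operator T() const { return T(); }  // declared name: "operator T"
///   };
///   int i = Box<int>();  // after substitution the name is "operator int"
/// @endcode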
DeclarationNameInfo Sema::SubstDeclarationNameInfo(const DeclarationNameInfo &NameInfo, const MultiLevelTemplateArgumentList &TemplateArgs) { TemplateInstantiator Instantiator(*this, TemplateArgs, NameInfo.getLoc(), NameInfo.getName()); return Instantiator.TransformDeclarationNameInfo(NameInfo); } TemplateName Sema::SubstTemplateName(NestedNameSpecifierLoc QualifierLoc, TemplateName Name, SourceLocation Loc, const MultiLevelTemplateArgumentList &TemplateArgs) { TemplateInstantiator Instantiator(*this, TemplateArgs, Loc, DeclarationName()); CXXScopeSpec SS; SS.Adopt(QualifierLoc); return Instantiator.TransformTemplateName(SS, Name, Loc); } static const Decl *getCanonicalParmVarDecl(const Decl *D) { // When storing ParmVarDecls in the local instantiation scope, we always // want to use the ParmVarDecl from the canonical function declaration, // since the map is then valid for any redeclaration or definition of that // function. if (const ParmVarDecl *PV = dyn_cast(D)) { if (const FunctionDecl *FD = dyn_cast(PV->getDeclContext())) { unsigned i = PV->getFunctionScopeIndex(); // This parameter might be from a freestanding function type within the // function and isn't necessarily referring to one of FD's parameters. if (i < FD->getNumParams() && FD->getParamDecl(i) == PV) return FD->getCanonicalDecl()->getParamDecl(i); } } return D; } llvm::PointerUnion * LocalInstantiationScope::findInstantiationOf(const Decl *D) { D = getCanonicalParmVarDecl(D); for (LocalInstantiationScope *Current = this; Current; Current = Current->Outer) { // Check if we found something within this scope. const Decl *CheckD = D; do { LocalDeclsMap::iterator Found = Current->LocalDecls.find(CheckD); if (Found != Current->LocalDecls.end()) return &Found->second; // If this is a tag declaration, it's possible that we need to look for // a previous declaration. if (const TagDecl *Tag = dyn_cast(CheckD)) CheckD = Tag->getPreviousDecl(); else CheckD = nullptr; } while (CheckD); // If we aren't combined with our outer scope, we're done. if (!Current->CombineWithOuterScope) break; } // If we're performing a partial substitution during template argument // deduction, we may not have values for template parameters yet. if (isa(D) || isa(D) || isa(D)) return nullptr; // Local types referenced prior to definition may require instantiation. if (const CXXRecordDecl *RD = dyn_cast(D)) if (RD->isLocalClass()) return nullptr; // Enumeration types referenced prior to definition may appear as a result of // error recovery. if (isa(D)) return nullptr; // Materialized typedefs/type alias for implicit deduction guides may require // instantiation. if (isa(D) && isa(D->getDeclContext())) return nullptr; // If we didn't find the decl, then we either have a sema bug, or we have a // forward reference to a label declaration. Return null to indicate that // we have an uninstantiated label. assert(isa(D) && "declaration not instantiated in this scope"); return nullptr; } void LocalInstantiationScope::InstantiatedLocal(const Decl *D, Decl *Inst) { D = getCanonicalParmVarDecl(D); llvm::PointerUnion &Stored = LocalDecls[D]; if (Stored.isNull()) { #ifndef NDEBUG // It should not be present in any surrounding scope either. 
LocalInstantiationScope *Current = this; while (Current->CombineWithOuterScope && Current->Outer) { Current = Current->Outer; assert(Current->LocalDecls.find(D) == Current->LocalDecls.end() && "Instantiated local in inner and outer scopes"); } #endif Stored = Inst; } else if (DeclArgumentPack *Pack = Stored.dyn_cast()) { Pack->push_back(cast(Inst)); } else { assert(Stored.get() == Inst && "Already instantiated this local"); } } void LocalInstantiationScope::InstantiatedLocalPackArg(const Decl *D, VarDecl *Inst) { D = getCanonicalParmVarDecl(D); DeclArgumentPack *Pack = LocalDecls[D].get(); Pack->push_back(Inst); } void LocalInstantiationScope::MakeInstantiatedLocalArgPack(const Decl *D) { #ifndef NDEBUG // This should be the first time we've been told about this decl. for (LocalInstantiationScope *Current = this; Current && Current->CombineWithOuterScope; Current = Current->Outer) assert(Current->LocalDecls.find(D) == Current->LocalDecls.end() && "Creating local pack after instantiation of local"); #endif D = getCanonicalParmVarDecl(D); llvm::PointerUnion &Stored = LocalDecls[D]; DeclArgumentPack *Pack = new DeclArgumentPack; Stored = Pack; ArgumentPacks.push_back(Pack); } bool LocalInstantiationScope::isLocalPackExpansion(const Decl *D) { for (DeclArgumentPack *Pack : ArgumentPacks) if (llvm::is_contained(*Pack, D)) return true; return false; } void LocalInstantiationScope::SetPartiallySubstitutedPack(NamedDecl *Pack, const TemplateArgument *ExplicitArgs, unsigned NumExplicitArgs) { assert((!PartiallySubstitutedPack || PartiallySubstitutedPack == Pack) && "Already have a partially-substituted pack"); assert((!PartiallySubstitutedPack || NumArgsInPartiallySubstitutedPack == NumExplicitArgs) && "Wrong number of arguments in partially-substituted pack"); PartiallySubstitutedPack = Pack; ArgsInPartiallySubstitutedPack = ExplicitArgs; NumArgsInPartiallySubstitutedPack = NumExplicitArgs; } NamedDecl *LocalInstantiationScope::getPartiallySubstitutedPack( const TemplateArgument **ExplicitArgs, unsigned *NumExplicitArgs) const { if (ExplicitArgs) *ExplicitArgs = nullptr; if (NumExplicitArgs) *NumExplicitArgs = 0; for (const LocalInstantiationScope *Current = this; Current; Current = Current->Outer) { if (Current->PartiallySubstitutedPack) { if (ExplicitArgs) *ExplicitArgs = Current->ArgsInPartiallySubstitutedPack; if (NumExplicitArgs) *NumExplicitArgs = Current->NumArgsInPartiallySubstitutedPack; return Current->PartiallySubstitutedPack; } if (!Current->CombineWithOuterScope) break; } return nullptr; } diff --git a/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc b/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc index a08ec11e77a4..b46bd2e4d7a4 100644 --- a/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc +++ b/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc @@ -1,3819 +1,3873 @@ //===-- gen_std.py generated file -------------------------------*- C++ -*-===// // // Used to build a lookup table (qualified names => include headers) for CPP // Standard Library symbols. // // This file was generated automatically by // clang/tools/include-mapping/gen_std.py, DO NOT EDIT! // // Generated from cppreference offline HTML book (modified on 2022-07-30). 
//===----------------------------------------------------------------------===// SYMBOL(ATOMIC_BOOL_LOCK_FREE, None, ) SYMBOL(ATOMIC_CHAR16_T_LOCK_FREE, None, ) SYMBOL(ATOMIC_CHAR32_T_LOCK_FREE, None, ) SYMBOL(ATOMIC_CHAR8_T_LOCK_FREE, None, ) SYMBOL(ATOMIC_CHAR_LOCK_FREE, None, ) SYMBOL(ATOMIC_FLAG_INIT, None, ) SYMBOL(ATOMIC_INT_LOCK_FREE, None, ) SYMBOL(ATOMIC_LLONG_LOCK_FREE, None, ) SYMBOL(ATOMIC_LONG_LOCK_FREE, None, ) SYMBOL(ATOMIC_POINTER_LOCK_FREE, None, ) SYMBOL(ATOMIC_SHORT_LOCK_FREE, None, ) SYMBOL(ATOMIC_VAR_INIT, None, ) SYMBOL(ATOMIC_WCHAR_T_LOCK_FREE, None, ) SYMBOL(BUFSIZ, None, ) SYMBOL(BUFSIZ, None, ) SYMBOL(CHAR_BIT, None, ) SYMBOL(CHAR_BIT, None, ) SYMBOL(CHAR_MAX, None, ) SYMBOL(CHAR_MAX, None, ) SYMBOL(CHAR_MIN, None, ) SYMBOL(CHAR_MIN, None, ) SYMBOL(CLOCKS_PER_SEC, None, ) SYMBOL(CLOCKS_PER_SEC, None, ) SYMBOL(DBL_DECIMAL_DIG, None, ) SYMBOL(DBL_DECIMAL_DIG, None, ) SYMBOL(DBL_DIG, None, ) SYMBOL(DBL_DIG, None, ) SYMBOL(DBL_EPSILON, None, ) SYMBOL(DBL_EPSILON, None, ) SYMBOL(DBL_HAS_SUBNORM, None, ) SYMBOL(DBL_HAS_SUBNORM, None, ) SYMBOL(DBL_MANT_DIG, None, ) SYMBOL(DBL_MANT_DIG, None, ) SYMBOL(DBL_MAX, None, ) SYMBOL(DBL_MAX, None, ) SYMBOL(DBL_MAX_10_EXP, None, ) SYMBOL(DBL_MAX_10_EXP, None, ) SYMBOL(DBL_MAX_EXP, None, ) SYMBOL(DBL_MAX_EXP, None, ) SYMBOL(DBL_MIN, None, ) SYMBOL(DBL_MIN, None, ) SYMBOL(DBL_MIN_10_EXP, None, ) SYMBOL(DBL_MIN_10_EXP, None, ) SYMBOL(DBL_MIN_EXP, None, ) SYMBOL(DBL_MIN_EXP, None, ) SYMBOL(DBL_TRUE_MIN, None, ) SYMBOL(DBL_TRUE_MIN, None, ) SYMBOL(DECIMAL_DIG, None, ) SYMBOL(DECIMAL_DIG, None, ) SYMBOL(E2BIG, None, ) SYMBOL(E2BIG, None, ) SYMBOL(EACCES, None, ) SYMBOL(EACCES, None, ) SYMBOL(EADDRINUSE, None, ) SYMBOL(EADDRINUSE, None, ) SYMBOL(EADDRNOTAVAIL, None, ) SYMBOL(EADDRNOTAVAIL, None, ) SYMBOL(EAFNOSUPPORT, None, ) SYMBOL(EAFNOSUPPORT, None, ) SYMBOL(EAGAIN, None, ) SYMBOL(EAGAIN, None, ) SYMBOL(EALREADY, None, ) SYMBOL(EALREADY, None, ) SYMBOL(EBADF, None, ) SYMBOL(EBADF, None, ) SYMBOL(EBADMSG, None, ) SYMBOL(EBADMSG, None, ) SYMBOL(EBUSY, None, ) SYMBOL(EBUSY, None, ) SYMBOL(ECANCELED, None, ) SYMBOL(ECANCELED, None, ) SYMBOL(ECHILD, None, ) SYMBOL(ECHILD, None, ) SYMBOL(ECONNABORTED, None, ) SYMBOL(ECONNABORTED, None, ) SYMBOL(ECONNREFUSED, None, ) SYMBOL(ECONNREFUSED, None, ) SYMBOL(ECONNRESET, None, ) SYMBOL(ECONNRESET, None, ) SYMBOL(EDEADLK, None, ) SYMBOL(EDEADLK, None, ) SYMBOL(EDESTADDRREQ, None, ) SYMBOL(EDESTADDRREQ, None, ) SYMBOL(EDOM, None, ) SYMBOL(EDOM, None, ) SYMBOL(EEXIST, None, ) SYMBOL(EEXIST, None, ) SYMBOL(EFAULT, None, ) SYMBOL(EFAULT, None, ) SYMBOL(EFBIG, None, ) SYMBOL(EFBIG, None, ) SYMBOL(EHOSTUNREACH, None, ) SYMBOL(EHOSTUNREACH, None, ) SYMBOL(EIDRM, None, ) SYMBOL(EIDRM, None, ) SYMBOL(EILSEQ, None, ) SYMBOL(EILSEQ, None, ) SYMBOL(EINPROGRESS, None, ) SYMBOL(EINPROGRESS, None, ) SYMBOL(EINTR, None, ) SYMBOL(EINTR, None, ) SYMBOL(EINVAL, None, ) SYMBOL(EINVAL, None, ) SYMBOL(EIO, None, ) SYMBOL(EIO, None, ) SYMBOL(EISCONN, None, ) SYMBOL(EISCONN, None, ) SYMBOL(EISDIR, None, ) SYMBOL(EISDIR, None, ) SYMBOL(ELOOP, None, ) SYMBOL(ELOOP, None, ) SYMBOL(EMFILE, None, ) SYMBOL(EMFILE, None, ) SYMBOL(EMLINK, None, ) SYMBOL(EMLINK, None, ) SYMBOL(EMSGSIZE, None, ) SYMBOL(EMSGSIZE, None, ) SYMBOL(ENAMETOOLONG, None, ) SYMBOL(ENAMETOOLONG, None, ) SYMBOL(ENETDOWN, None, ) SYMBOL(ENETDOWN, None, ) SYMBOL(ENETRESET, None, ) SYMBOL(ENETRESET, None, ) SYMBOL(ENETUNREACH, None, ) SYMBOL(ENETUNREACH, None, ) SYMBOL(ENFILE, None, ) SYMBOL(ENFILE, None, ) SYMBOL(ENOBUFS, None, ) SYMBOL(ENOBUFS, None, ) 
SYMBOL(ENODATA, None, ) SYMBOL(ENODATA, None, ) SYMBOL(ENODEV, None, ) SYMBOL(ENODEV, None, ) SYMBOL(ENOENT, None, ) SYMBOL(ENOENT, None, ) SYMBOL(ENOEXEC, None, ) SYMBOL(ENOEXEC, None, ) SYMBOL(ENOLCK, None, ) SYMBOL(ENOLCK, None, ) SYMBOL(ENOLINK, None, ) SYMBOL(ENOLINK, None, ) SYMBOL(ENOMEM, None, ) SYMBOL(ENOMEM, None, ) SYMBOL(ENOMSG, None, ) SYMBOL(ENOMSG, None, ) SYMBOL(ENOPROTOOPT, None, ) SYMBOL(ENOPROTOOPT, None, ) SYMBOL(ENOSPC, None, ) SYMBOL(ENOSPC, None, ) SYMBOL(ENOSR, None, ) SYMBOL(ENOSR, None, ) SYMBOL(ENOSTR, None, ) SYMBOL(ENOSTR, None, ) SYMBOL(ENOSYS, None, ) SYMBOL(ENOSYS, None, ) SYMBOL(ENOTCONN, None, ) SYMBOL(ENOTCONN, None, ) SYMBOL(ENOTDIR, None, ) SYMBOL(ENOTDIR, None, ) SYMBOL(ENOTEMPTY, None, ) SYMBOL(ENOTEMPTY, None, ) SYMBOL(ENOTRECOVERABLE, None, ) SYMBOL(ENOTRECOVERABLE, None, ) SYMBOL(ENOTSOCK, None, ) SYMBOL(ENOTSOCK, None, ) SYMBOL(ENOTSUP, None, ) SYMBOL(ENOTSUP, None, ) SYMBOL(ENOTTY, None, ) SYMBOL(ENOTTY, None, ) SYMBOL(ENXIO, None, ) SYMBOL(ENXIO, None, ) SYMBOL(EOF, None, ) SYMBOL(EOF, None, ) SYMBOL(EOPNOTSUPP, None, ) SYMBOL(EOPNOTSUPP, None, ) SYMBOL(EOVERFLOW, None, ) SYMBOL(EOVERFLOW, None, ) SYMBOL(EOWNERDEAD, None, ) SYMBOL(EOWNERDEAD, None, ) SYMBOL(EPERM, None, ) SYMBOL(EPERM, None, ) SYMBOL(EPIPE, None, ) SYMBOL(EPIPE, None, ) SYMBOL(EPROTO, None, ) SYMBOL(EPROTO, None, ) SYMBOL(EPROTONOSUPPORT, None, ) SYMBOL(EPROTONOSUPPORT, None, ) SYMBOL(EPROTOTYPE, None, ) SYMBOL(EPROTOTYPE, None, ) SYMBOL(ERANGE, None, ) SYMBOL(ERANGE, None, ) SYMBOL(EROFS, None, ) SYMBOL(EROFS, None, ) SYMBOL(ESPIPE, None, ) SYMBOL(ESPIPE, None, ) SYMBOL(ESRCH, None, ) SYMBOL(ESRCH, None, ) SYMBOL(ETIME, None, ) SYMBOL(ETIME, None, ) SYMBOL(ETIMEDOUT, None, ) SYMBOL(ETIMEDOUT, None, ) SYMBOL(ETXTBSY, None, ) SYMBOL(ETXTBSY, None, ) SYMBOL(EWOULDBLOCK, None, ) SYMBOL(EWOULDBLOCK, None, ) SYMBOL(EXDEV, None, ) SYMBOL(EXDEV, None, ) SYMBOL(EXIT_FAILURE, None, ) SYMBOL(EXIT_FAILURE, None, ) SYMBOL(EXIT_SUCCESS, None, ) SYMBOL(EXIT_SUCCESS, None, ) SYMBOL(FE_ALL_EXCEPT, None, ) SYMBOL(FE_ALL_EXCEPT, None, ) SYMBOL(FE_DFL_ENV, None, ) SYMBOL(FE_DFL_ENV, None, ) SYMBOL(FE_DIVBYZERO, None, ) SYMBOL(FE_DIVBYZERO, None, ) SYMBOL(FE_DOWNWARD, None, ) SYMBOL(FE_DOWNWARD, None, ) SYMBOL(FE_INEXACT, None, ) SYMBOL(FE_INEXACT, None, ) SYMBOL(FE_INVALID, None, ) SYMBOL(FE_INVALID, None, ) SYMBOL(FE_OVERFLOW, None, ) SYMBOL(FE_OVERFLOW, None, ) SYMBOL(FE_TONEAREST, None, ) SYMBOL(FE_TONEAREST, None, ) SYMBOL(FE_TOWARDZERO, None, ) SYMBOL(FE_TOWARDZERO, None, ) SYMBOL(FE_UNDERFLOW, None, ) SYMBOL(FE_UNDERFLOW, None, ) SYMBOL(FE_UPWARD, None, ) SYMBOL(FE_UPWARD, None, ) SYMBOL(FILENAME_MAX, None, ) SYMBOL(FILENAME_MAX, None, ) SYMBOL(FLT_DECIMAL_DIG, None, ) SYMBOL(FLT_DECIMAL_DIG, None, ) SYMBOL(FLT_DIG, None, ) SYMBOL(FLT_DIG, None, ) SYMBOL(FLT_EPSILON, None, ) SYMBOL(FLT_EPSILON, None, ) SYMBOL(FLT_EVAL_METHOD, None, ) SYMBOL(FLT_EVAL_METHOD, None, ) SYMBOL(FLT_HAS_SUBNORM, None, ) SYMBOL(FLT_HAS_SUBNORM, None, ) SYMBOL(FLT_MANT_DIG, None, ) SYMBOL(FLT_MANT_DIG, None, ) SYMBOL(FLT_MAX, None, ) SYMBOL(FLT_MAX, None, ) SYMBOL(FLT_MAX_10_EXP, None, ) SYMBOL(FLT_MAX_10_EXP, None, ) SYMBOL(FLT_MAX_EXP, None, ) SYMBOL(FLT_MAX_EXP, None, ) SYMBOL(FLT_MIN, None, ) SYMBOL(FLT_MIN, None, ) SYMBOL(FLT_MIN_10_EXP, None, ) SYMBOL(FLT_MIN_10_EXP, None, ) SYMBOL(FLT_MIN_EXP, None, ) SYMBOL(FLT_MIN_EXP, None, ) SYMBOL(FLT_RADIX, None, ) SYMBOL(FLT_RADIX, None, ) SYMBOL(FLT_ROUNDS, None, ) SYMBOL(FLT_ROUNDS, None, ) SYMBOL(FLT_TRUE_MIN, None, ) SYMBOL(FLT_TRUE_MIN, None, ) SYMBOL(FOPEN_MAX, 
None, ) SYMBOL(FOPEN_MAX, None, ) SYMBOL(FP_FAST_FMA, None, ) SYMBOL(FP_FAST_FMA, None, ) SYMBOL(FP_FAST_FMAF, None, ) SYMBOL(FP_FAST_FMAF, None, ) SYMBOL(FP_FAST_FMAL, None, ) SYMBOL(FP_FAST_FMAL, None, ) SYMBOL(FP_ILOGB0, None, ) SYMBOL(FP_ILOGB0, None, ) SYMBOL(FP_ILOGBNAN, None, ) SYMBOL(FP_ILOGBNAN, None, ) SYMBOL(FP_INFINITE, None, ) SYMBOL(FP_INFINITE, None, ) SYMBOL(FP_NAN, None, ) SYMBOL(FP_NAN, None, ) SYMBOL(FP_NORMAL, None, ) SYMBOL(FP_NORMAL, None, ) SYMBOL(FP_SUBNORMAL, None, ) SYMBOL(FP_SUBNORMAL, None, ) SYMBOL(FP_ZERO, None, ) SYMBOL(FP_ZERO, None, ) SYMBOL(HUGE_VAL, None, ) SYMBOL(HUGE_VAL, None, ) SYMBOL(HUGE_VALF, None, ) SYMBOL(HUGE_VALF, None, ) SYMBOL(HUGE_VALL, None, ) SYMBOL(HUGE_VALL, None, ) SYMBOL(INFINITY, None, ) SYMBOL(INFINITY, None, ) SYMBOL(INT16_MAX, None, ) SYMBOL(INT16_MAX, None, ) SYMBOL(INT16_MIN, None, ) SYMBOL(INT16_MIN, None, ) SYMBOL(INT32_MAX, None, ) SYMBOL(INT32_MAX, None, ) SYMBOL(INT32_MIN, None, ) SYMBOL(INT32_MIN, None, ) SYMBOL(INT64_MAX, None, ) SYMBOL(INT64_MAX, None, ) SYMBOL(INT64_MIN, None, ) SYMBOL(INT64_MIN, None, ) SYMBOL(INT8_MAX, None, ) SYMBOL(INT8_MAX, None, ) SYMBOL(INT8_MIN, None, ) SYMBOL(INT8_MIN, None, ) SYMBOL(INTMAX_MAX, None, ) SYMBOL(INTMAX_MAX, None, ) SYMBOL(INTMAX_MIN, None, ) SYMBOL(INTMAX_MIN, None, ) SYMBOL(INTPTR_MAX, None, ) SYMBOL(INTPTR_MAX, None, ) SYMBOL(INTPTR_MIN, None, ) SYMBOL(INTPTR_MIN, None, ) SYMBOL(INT_FAST16_MAX, None, ) SYMBOL(INT_FAST16_MAX, None, ) SYMBOL(INT_FAST16_MIN, None, ) SYMBOL(INT_FAST16_MIN, None, ) SYMBOL(INT_FAST32_MAX, None, ) SYMBOL(INT_FAST32_MAX, None, ) SYMBOL(INT_FAST32_MIN, None, ) SYMBOL(INT_FAST32_MIN, None, ) SYMBOL(INT_FAST64_MAX, None, ) SYMBOL(INT_FAST64_MAX, None, ) SYMBOL(INT_FAST64_MIN, None, ) SYMBOL(INT_FAST64_MIN, None, ) SYMBOL(INT_FAST8_MAX, None, ) SYMBOL(INT_FAST8_MAX, None, ) SYMBOL(INT_FAST8_MIN, None, ) SYMBOL(INT_FAST8_MIN, None, ) SYMBOL(INT_LEAST16_MAX, None, ) SYMBOL(INT_LEAST16_MAX, None, ) SYMBOL(INT_LEAST16_MIN, None, ) SYMBOL(INT_LEAST16_MIN, None, ) SYMBOL(INT_LEAST32_MAX, None, ) SYMBOL(INT_LEAST32_MAX, None, ) SYMBOL(INT_LEAST32_MIN, None, ) SYMBOL(INT_LEAST32_MIN, None, ) SYMBOL(INT_LEAST64_MAX, None, ) SYMBOL(INT_LEAST64_MAX, None, ) SYMBOL(INT_LEAST64_MIN, None, ) SYMBOL(INT_LEAST64_MIN, None, ) SYMBOL(INT_LEAST8_MAX, None, ) SYMBOL(INT_LEAST8_MAX, None, ) SYMBOL(INT_LEAST8_MIN, None, ) SYMBOL(INT_LEAST8_MIN, None, ) SYMBOL(INT_MAX, None, ) SYMBOL(INT_MAX, None, ) SYMBOL(INT_MIN, None, ) SYMBOL(INT_MIN, None, ) SYMBOL(LC_ALL, None, ) SYMBOL(LC_ALL, None, ) SYMBOL(LC_COLLATE, None, ) SYMBOL(LC_COLLATE, None, ) SYMBOL(LC_CTYPE, None, ) SYMBOL(LC_CTYPE, None, ) SYMBOL(LC_MONETARY, None, ) SYMBOL(LC_MONETARY, None, ) SYMBOL(LC_NUMERIC, None, ) SYMBOL(LC_NUMERIC, None, ) SYMBOL(LC_TIME, None, ) SYMBOL(LC_TIME, None, ) SYMBOL(LDBL_DECIMAL_DIG, None, ) SYMBOL(LDBL_DECIMAL_DIG, None, ) SYMBOL(LDBL_DIG, None, ) SYMBOL(LDBL_DIG, None, ) SYMBOL(LDBL_EPSILON, None, ) SYMBOL(LDBL_EPSILON, None, ) SYMBOL(LDBL_HAS_SUBNORM, None, ) SYMBOL(LDBL_HAS_SUBNORM, None, ) SYMBOL(LDBL_MANT_DIG, None, ) SYMBOL(LDBL_MANT_DIG, None, ) SYMBOL(LDBL_MAX, None, ) SYMBOL(LDBL_MAX, None, ) SYMBOL(LDBL_MAX_10_EXP, None, ) SYMBOL(LDBL_MAX_10_EXP, None, ) SYMBOL(LDBL_MAX_EXP, None, ) SYMBOL(LDBL_MAX_EXP, None, ) SYMBOL(LDBL_MIN, None, ) SYMBOL(LDBL_MIN, None, ) SYMBOL(LDBL_MIN_10_EXP, None, ) SYMBOL(LDBL_MIN_10_EXP, None, ) SYMBOL(LDBL_MIN_EXP, None, ) SYMBOL(LDBL_MIN_EXP, None, ) SYMBOL(LDBL_TRUE_MIN, None, ) SYMBOL(LDBL_TRUE_MIN, None, ) SYMBOL(LLONG_MAX, None, ) 
SYMBOL(LLONG_MAX, None, ) SYMBOL(LLONG_MIN, None, ) SYMBOL(LLONG_MIN, None, ) SYMBOL(LONG_MAX, None, ) SYMBOL(LONG_MAX, None, ) SYMBOL(LONG_MIN, None, ) SYMBOL(LONG_MIN, None, ) SYMBOL(L_tmpnam, None, ) SYMBOL(L_tmpnam, None, ) SYMBOL(MATH_ERREXCEPT, None, ) SYMBOL(MATH_ERREXCEPT, None, ) SYMBOL(MATH_ERRNO, None, ) SYMBOL(MATH_ERRNO, None, ) SYMBOL(MB_CUR_MAX, None, ) SYMBOL(MB_CUR_MAX, None, ) SYMBOL(MB_LEN_MAX, None, ) SYMBOL(MB_LEN_MAX, None, ) SYMBOL(NAN, None, ) SYMBOL(NAN, None, ) SYMBOL(ONCE_FLAG_INIT, None, ) SYMBOL(PTRDIFF_MAX, None, ) SYMBOL(PTRDIFF_MAX, None, ) SYMBOL(PTRDIFF_MIN, None, ) SYMBOL(PTRDIFF_MIN, None, ) SYMBOL(RAND_MAX, None, ) SYMBOL(RAND_MAX, None, ) SYMBOL(SCHAR_MAX, None, ) SYMBOL(SCHAR_MAX, None, ) SYMBOL(SCHAR_MIN, None, ) SYMBOL(SCHAR_MIN, None, ) SYMBOL(SEEK_CUR, None, ) SYMBOL(SEEK_CUR, None, ) SYMBOL(SEEK_END, None, ) SYMBOL(SEEK_END, None, ) SYMBOL(SEEK_SET, None, ) SYMBOL(SEEK_SET, None, ) SYMBOL(SHRT_MAX, None, ) SYMBOL(SHRT_MAX, None, ) SYMBOL(SHRT_MIN, None, ) SYMBOL(SHRT_MIN, None, ) SYMBOL(SIGABRT, None, ) SYMBOL(SIGABRT, None, ) SYMBOL(SIGFPE, None, ) SYMBOL(SIGFPE, None, ) SYMBOL(SIGILL, None, ) SYMBOL(SIGILL, None, ) SYMBOL(SIGINT, None, ) SYMBOL(SIGINT, None, ) SYMBOL(SIGSEGV, None, ) SYMBOL(SIGSEGV, None, ) SYMBOL(SIGTERM, None, ) SYMBOL(SIGTERM, None, ) SYMBOL(SIG_ATOMIC_MAX, None, ) SYMBOL(SIG_ATOMIC_MAX, None, ) SYMBOL(SIG_ATOMIC_MIN, None, ) SYMBOL(SIG_ATOMIC_MIN, None, ) SYMBOL(SIG_DFL, None, ) SYMBOL(SIG_DFL, None, ) SYMBOL(SIG_ERR, None, ) SYMBOL(SIG_ERR, None, ) SYMBOL(SIG_IGN, None, ) SYMBOL(SIG_IGN, None, ) SYMBOL(SIZE_MAX, None, ) SYMBOL(SIZE_MAX, None, ) SYMBOL(TIME_UTC, None, ) SYMBOL(TIME_UTC, None, ) SYMBOL(TMP_MAX, None, ) SYMBOL(TMP_MAX, None, ) SYMBOL(UCHAR_MAX, None, ) SYMBOL(UCHAR_MAX, None, ) SYMBOL(UINT16_MAX, None, ) SYMBOL(UINT16_MAX, None, ) SYMBOL(UINT32_MAX, None, ) SYMBOL(UINT32_MAX, None, ) SYMBOL(UINT64_MAX, None, ) SYMBOL(UINT64_MAX, None, ) SYMBOL(UINT8_MAX, None, ) SYMBOL(UINT8_MAX, None, ) SYMBOL(UINTMAX_MAX, None, ) SYMBOL(UINTMAX_MAX, None, ) SYMBOL(UINTPTR_MAX, None, ) SYMBOL(UINTPTR_MAX, None, ) SYMBOL(UINT_FAST16_MAX, None, ) SYMBOL(UINT_FAST16_MAX, None, ) SYMBOL(UINT_FAST32_MAX, None, ) SYMBOL(UINT_FAST32_MAX, None, ) SYMBOL(UINT_FAST64_MAX, None, ) SYMBOL(UINT_FAST64_MAX, None, ) SYMBOL(UINT_FAST8_MAX, None, ) SYMBOL(UINT_FAST8_MAX, None, ) SYMBOL(UINT_LEAST16_MAX, None, ) SYMBOL(UINT_LEAST16_MAX, None, ) SYMBOL(UINT_LEAST32_MAX, None, ) SYMBOL(UINT_LEAST32_MAX, None, ) SYMBOL(UINT_LEAST64_MAX, None, ) SYMBOL(UINT_LEAST64_MAX, None, ) SYMBOL(UINT_LEAST8_MAX, None, ) SYMBOL(UINT_LEAST8_MAX, None, ) SYMBOL(UINT_MAX, None, ) SYMBOL(UINT_MAX, None, ) SYMBOL(ULLONG_MAX, None, ) SYMBOL(ULLONG_MAX, None, ) SYMBOL(ULONG_MAX, None, ) SYMBOL(ULONG_MAX, None, ) SYMBOL(USHRT_MAX, None, ) SYMBOL(USHRT_MAX, None, ) SYMBOL(WEOF, None, ) SYMBOL(WEOF, None, ) SYMBOL(WINT_MAX, None, ) SYMBOL(WINT_MAX, None, ) SYMBOL(WINT_MIN, None, ) SYMBOL(WINT_MIN, None, ) SYMBOL(_IOFBF, None, ) SYMBOL(_IOFBF, None, ) SYMBOL(_IOLBF, None, ) SYMBOL(_IOLBF, None, ) SYMBOL(_IONBF, None, ) SYMBOL(_IONBF, None, ) SYMBOL(assert, None, ) SYMBOL(assert, None, ) SYMBOL(errno, None, ) SYMBOL(errno, None, ) SYMBOL(math_errhandling, None, ) SYMBOL(math_errhandling, None, ) SYMBOL(offsetof, None, ) SYMBOL(offsetof, None, ) SYMBOL(setjmp, None, ) SYMBOL(setjmp, None, ) SYMBOL(stderr, None, ) SYMBOL(stderr, None, ) SYMBOL(stdin, None, ) SYMBOL(stdin, None, ) SYMBOL(stdout, None, ) SYMBOL(stdout, None, ) SYMBOL(va_arg, None, ) SYMBOL(va_arg, None, ) 
SYMBOL(va_copy, None, ) SYMBOL(va_copy, None, ) SYMBOL(va_end, None, ) SYMBOL(va_end, None, ) SYMBOL(va_start, None, ) SYMBOL(va_start, None, ) SYMBOL(FILE, std::, ) SYMBOL(FILE, None, ) SYMBOL(FILE, None, ) SYMBOL(_Exit, std::, ) SYMBOL(_Exit, None, ) SYMBOL(_Exit, None, ) SYMBOL(accumulate, std::, ) SYMBOL(acos, std::, ) SYMBOL(acos, None, ) SYMBOL(acos, None, ) SYMBOL(acosf, std::, ) SYMBOL(acosf, None, ) SYMBOL(acosf, None, ) SYMBOL(acosh, std::, ) SYMBOL(acosh, None, ) SYMBOL(acosh, None, ) SYMBOL(acoshf, std::, ) SYMBOL(acoshf, None, ) SYMBOL(acoshf, None, ) SYMBOL(acoshl, std::, ) SYMBOL(acoshl, None, ) SYMBOL(acoshl, None, ) SYMBOL(acosl, std::, ) SYMBOL(acosl, None, ) SYMBOL(acosl, None, ) SYMBOL(add_const, std::, ) SYMBOL(add_const_t, std::, ) SYMBOL(add_cv, std::, ) SYMBOL(add_cv_t, std::, ) SYMBOL(add_lvalue_reference, std::, ) SYMBOL(add_lvalue_reference_t, std::, ) SYMBOL(add_pointer, std::, ) SYMBOL(add_pointer_t, std::, ) SYMBOL(add_rvalue_reference, std::, ) SYMBOL(add_rvalue_reference_t, std::, ) SYMBOL(add_volatile, std::, ) SYMBOL(add_volatile_t, std::, ) SYMBOL(addressof, std::, ) SYMBOL(adjacent_difference, std::, ) SYMBOL(adjacent_find, std::, ) SYMBOL(adopt_lock, std::, ) SYMBOL(adopt_lock_t, std::, ) SYMBOL(advance, std::, ) SYMBOL(align, std::, ) SYMBOL(align_val_t, std::, ) SYMBOL(aligned_alloc, std::, ) SYMBOL(aligned_alloc, None, ) SYMBOL(aligned_alloc, None, ) SYMBOL(aligned_storage, std::, ) SYMBOL(aligned_storage_t, std::, ) SYMBOL(aligned_union, std::, ) SYMBOL(aligned_union_t, std::, ) SYMBOL(alignment_of, std::, ) SYMBOL(alignment_of_v, std::, ) SYMBOL(all_of, std::, ) SYMBOL(allocate_at_least, std::, ) SYMBOL(allocate_shared, std::, ) SYMBOL(allocate_shared_for_overwrite, std::, ) SYMBOL(allocation_result, std::, ) SYMBOL(allocator, std::, ) SYMBOL(allocator_arg, std::, ) SYMBOL(allocator_arg_t, std::, ) SYMBOL(allocator_traits, std::, ) SYMBOL(any, std::, ) SYMBOL(any_of, std::, ) SYMBOL(apply, std::, ) SYMBOL(arg, std::, ) SYMBOL(array, std::, ) SYMBOL(as_bytes, std::, ) SYMBOL(as_const, std::, ) SYMBOL(as_writable_bytes, std::, ) SYMBOL(asctime, std::, ) SYMBOL(asctime, None, ) SYMBOL(asctime, None, ) SYMBOL(asin, std::, ) SYMBOL(asin, None, ) SYMBOL(asin, None, ) SYMBOL(asinf, std::, ) SYMBOL(asinf, None, ) SYMBOL(asinf, None, ) SYMBOL(asinh, std::, ) SYMBOL(asinh, None, ) SYMBOL(asinh, None, ) SYMBOL(asinhf, std::, ) SYMBOL(asinhf, None, ) SYMBOL(asinhf, None, ) SYMBOL(asinhl, std::, ) SYMBOL(asinhl, None, ) SYMBOL(asinhl, None, ) SYMBOL(asinl, std::, ) SYMBOL(asinl, None, ) SYMBOL(asinl, None, ) SYMBOL(assignable_from, std::, ) SYMBOL(assoc_laguerre, std::, ) SYMBOL(assoc_laguerref, std::, ) SYMBOL(assoc_laguerrel, std::, ) SYMBOL(assoc_legendre, std::, ) SYMBOL(assoc_legendref, std::, ) SYMBOL(assoc_legendrel, std::, ) SYMBOL(assume_aligned, std::, ) SYMBOL(async, std::, ) SYMBOL(at_quick_exit, std::, ) SYMBOL(at_quick_exit, None, ) SYMBOL(at_quick_exit, None, ) SYMBOL(atan, std::, ) SYMBOL(atan, None, ) SYMBOL(atan, None, ) SYMBOL(atan2, std::, ) SYMBOL(atan2, None, ) SYMBOL(atan2, None, ) SYMBOL(atan2f, std::, ) SYMBOL(atan2f, None, ) SYMBOL(atan2f, None, ) SYMBOL(atan2l, std::, ) SYMBOL(atan2l, None, ) SYMBOL(atan2l, None, ) SYMBOL(atanf, std::, ) SYMBOL(atanf, None, ) SYMBOL(atanf, None, ) SYMBOL(atanh, std::, ) SYMBOL(atanh, None, ) SYMBOL(atanh, None, ) SYMBOL(atanhf, std::, ) SYMBOL(atanhf, None, ) SYMBOL(atanhf, None, ) SYMBOL(atanhl, std::, ) SYMBOL(atanhl, None, ) SYMBOL(atanhl, None, ) SYMBOL(atanl, std::, ) SYMBOL(atanl, None, ) 
SYMBOL(atanl, None, ) SYMBOL(atexit, std::, ) SYMBOL(atexit, None, ) SYMBOL(atexit, None, ) SYMBOL(atof, std::, ) SYMBOL(atof, None, ) SYMBOL(atof, None, ) SYMBOL(atoi, std::, ) SYMBOL(atoi, None, ) SYMBOL(atoi, None, ) SYMBOL(atol, std::, ) SYMBOL(atol, None, ) SYMBOL(atol, None, ) SYMBOL(atoll, std::, ) SYMBOL(atoll, None, ) SYMBOL(atoll, None, ) SYMBOL(atomic_compare_exchange_strong, std::, ) SYMBOL(atomic_compare_exchange_strong_explicit, std::, ) SYMBOL(atomic_compare_exchange_weak, std::, ) SYMBOL(atomic_compare_exchange_weak_explicit, std::, ) SYMBOL(atomic_exchange, std::, ) SYMBOL(atomic_exchange_explicit, std::, ) SYMBOL(atomic_fetch_add, std::, ) SYMBOL(atomic_fetch_add_explicit, std::, ) SYMBOL(atomic_fetch_and, std::, ) SYMBOL(atomic_fetch_and_explicit, std::, ) SYMBOL(atomic_fetch_or, std::, ) SYMBOL(atomic_fetch_or_explicit, std::, ) SYMBOL(atomic_fetch_sub, std::, ) SYMBOL(atomic_fetch_sub_explicit, std::, ) SYMBOL(atomic_fetch_xor, std::, ) SYMBOL(atomic_fetch_xor_explicit, std::, ) SYMBOL(atomic_flag, std::, ) SYMBOL(atomic_flag_clear, std::, ) SYMBOL(atomic_flag_clear_explicit, std::, ) SYMBOL(atomic_flag_notify_all, std::, ) SYMBOL(atomic_flag_notify_one, std::, ) SYMBOL(atomic_flag_test, std::, ) SYMBOL(atomic_flag_test_and_set, std::, ) SYMBOL(atomic_flag_test_and_set_explicit, std::, ) SYMBOL(atomic_flag_test_explicit, std::, ) SYMBOL(atomic_flag_wait, std::, ) SYMBOL(atomic_flag_wait_explicit, std::, ) SYMBOL(atomic_init, std::, ) SYMBOL(atomic_is_lock_free, std::, ) SYMBOL(atomic_load, std::, ) SYMBOL(atomic_load_explicit, std::, ) SYMBOL(atomic_notify_all, std::, ) SYMBOL(atomic_notify_one, std::, ) SYMBOL(atomic_ref, std::, ) SYMBOL(atomic_signal_fence, std::, ) SYMBOL(atomic_store, std::, ) SYMBOL(atomic_store_explicit, std::, ) SYMBOL(atomic_thread_fence, std::, ) SYMBOL(atomic_wait, std::, ) SYMBOL(atomic_wait_explicit, std::, ) SYMBOL(atto, std::, ) SYMBOL(auto_ptr, std::, ) SYMBOL(back_insert_iterator, std::, ) SYMBOL(back_inserter, std::, ) SYMBOL(bad_alloc, std::, ) SYMBOL(bad_any_cast, std::, ) SYMBOL(bad_array_new_length, std::, ) SYMBOL(bad_cast, std::, ) SYMBOL(bad_exception, std::, ) SYMBOL(bad_function_call, std::, ) SYMBOL(bad_optional_access, std::, ) SYMBOL(bad_typeid, std::, ) SYMBOL(bad_variant_access, std::, ) SYMBOL(bad_weak_ptr, std::, ) SYMBOL(barrier, std::, ) SYMBOL(basic_common_reference, std::, ) SYMBOL(basic_filebuf, std::, ) SYMBOL(basic_filebuf, std::, ) SYMBOL(basic_format_arg, std::, ) SYMBOL(basic_format_args, std::, ) SYMBOL(basic_format_context, std::, ) SYMBOL(basic_format_parse_context, std::, ) SYMBOL(basic_fstream, std::, ) SYMBOL(basic_fstream, std::, ) SYMBOL(basic_ifstream, std::, ) SYMBOL(basic_ifstream, std::, ) SYMBOL(basic_ios, std::, ) SYMBOL(basic_ios, std::, ) SYMBOL(basic_ios, std::, ) SYMBOL(basic_iostream, std::, ) SYMBOL(basic_iostream, std::, ) SYMBOL(basic_iostream, std::, ) SYMBOL(basic_ispanstream, std::, ) SYMBOL(basic_ispanstream, std::, ) SYMBOL(basic_istream, std::, ) SYMBOL(basic_istream, std::, ) SYMBOL(basic_istream, std::, ) SYMBOL(basic_istringstream, std::, ) SYMBOL(basic_istringstream, std::, ) SYMBOL(basic_ofstream, std::, ) SYMBOL(basic_ofstream, std::, ) SYMBOL(basic_ospanstream, std::, ) SYMBOL(basic_ospanstream, std::, ) SYMBOL(basic_ostream, std::, ) SYMBOL(basic_ostream, std::, ) SYMBOL(basic_ostream, std::, ) SYMBOL(basic_ostringstream, std::, ) SYMBOL(basic_ostringstream, std::, ) SYMBOL(basic_osyncstream, std::, ) SYMBOL(basic_osyncstream, std::, ) SYMBOL(basic_regex, std::, ) 
SYMBOL(basic_spanbuf, std::, ) SYMBOL(basic_spanbuf, std::, ) SYMBOL(basic_spanstream, std::, ) SYMBOL(basic_spanstream, std::, ) SYMBOL(basic_stacktrace, std::, ) SYMBOL(basic_streambuf, std::, ) SYMBOL(basic_streambuf, std::, ) SYMBOL(basic_streambuf, std::, ) SYMBOL(basic_string, std::, ) SYMBOL(basic_string_view, std::, ) SYMBOL(basic_stringbuf, std::, ) SYMBOL(basic_stringbuf, std::, ) SYMBOL(basic_stringstream, std::, ) SYMBOL(basic_stringstream, std::, ) SYMBOL(basic_syncbuf, std::, ) SYMBOL(basic_syncbuf, std::, ) SYMBOL(bernoulli_distribution, std::, ) SYMBOL(beta, std::, ) SYMBOL(betaf, std::, ) SYMBOL(betal, std::, ) SYMBOL(bidirectional_iterator, std::, ) SYMBOL(bidirectional_iterator_tag, std::, ) SYMBOL(binary_function, std::, ) SYMBOL(binary_negate, std::, ) SYMBOL(binary_search, std::, ) SYMBOL(binary_semaphore, std::, ) SYMBOL(bind, std::, ) SYMBOL(bind1st, std::, ) SYMBOL(bind2nd, std::, ) SYMBOL(bind_back, std::, ) SYMBOL(bind_front, std::, ) SYMBOL(binder1st, std::, ) SYMBOL(binder2nd, std::, ) SYMBOL(binomial_distribution, std::, ) SYMBOL(bit_and, std::, ) SYMBOL(bit_cast, std::, ) SYMBOL(bit_ceil, std::, ) SYMBOL(bit_floor, std::, ) SYMBOL(bit_not, std::, ) SYMBOL(bit_or, std::, ) SYMBOL(bit_width, std::, ) SYMBOL(bit_xor, std::, ) SYMBOL(bitset, std::, ) SYMBOL(bool_constant, std::, ) SYMBOL(boolalpha, std::, ) SYMBOL(boolalpha, std::, ) SYMBOL(boyer_moore_horspool_searcher, std::, ) SYMBOL(boyer_moore_searcher, std::, ) SYMBOL(bsearch, std::, ) SYMBOL(bsearch, None, ) SYMBOL(bsearch, None, ) SYMBOL(btowc, std::, ) SYMBOL(btowc, None, ) SYMBOL(btowc, None, ) SYMBOL(byte, std::, ) SYMBOL(byteswap, std::, ) SYMBOL(c16rtomb, std::, ) SYMBOL(c16rtomb, None, ) SYMBOL(c16rtomb, None, ) SYMBOL(c32rtomb, std::, ) SYMBOL(c32rtomb, None, ) SYMBOL(c32rtomb, None, ) SYMBOL(c8rtomb, std::, ) SYMBOL(c8rtomb, None, ) SYMBOL(c8rtomb, None, ) SYMBOL(call_once, std::, ) SYMBOL(calloc, std::, ) SYMBOL(calloc, None, ) SYMBOL(calloc, None, ) SYMBOL(cauchy_distribution, std::, ) SYMBOL(cbrt, std::, ) SYMBOL(cbrt, None, ) SYMBOL(cbrt, None, ) SYMBOL(cbrtf, std::, ) SYMBOL(cbrtf, None, ) SYMBOL(cbrtf, None, ) SYMBOL(cbrtl, std::, ) SYMBOL(cbrtl, None, ) SYMBOL(cbrtl, None, ) SYMBOL(ceil, std::, ) SYMBOL(ceil, None, ) SYMBOL(ceil, None, ) SYMBOL(ceilf, std::, ) SYMBOL(ceilf, None, ) SYMBOL(ceilf, None, ) SYMBOL(ceill, std::, ) SYMBOL(ceill, None, ) SYMBOL(ceill, None, ) SYMBOL(centi, std::, ) SYMBOL(cerr, std::, ) SYMBOL(char_traits, std::, ) SYMBOL(chars_format, std::, ) SYMBOL(chi_squared_distribution, std::, ) SYMBOL(cin, std::, ) SYMBOL(clamp, std::, ) SYMBOL(clearerr, std::, ) SYMBOL(clearerr, None, ) SYMBOL(clearerr, None, ) SYMBOL(clock, std::, ) SYMBOL(clock, None, ) SYMBOL(clock, None, ) SYMBOL(clock_t, std::, ) SYMBOL(clock_t, None, ) SYMBOL(clock_t, None, ) SYMBOL(clog, std::, ) SYMBOL(cmatch, std::, ) SYMBOL(cmp_equal, std::, ) SYMBOL(cmp_greater, std::, ) SYMBOL(cmp_greater_equal, std::, ) SYMBOL(cmp_less, std::, ) SYMBOL(cmp_less_equal, std::, ) SYMBOL(cmp_not_equal, std::, ) SYMBOL(codecvt, std::, ) SYMBOL(codecvt_base, std::, ) SYMBOL(codecvt_byname, std::, ) SYMBOL(codecvt_mode, std::, ) SYMBOL(codecvt_utf16, std::, ) SYMBOL(codecvt_utf8, std::, ) SYMBOL(codecvt_utf8_utf16, std::, ) SYMBOL(collate, std::, ) SYMBOL(collate_byname, std::, ) SYMBOL(common_comparison_category, std::, ) SYMBOL(common_comparison_category_t, std::, ) SYMBOL(common_iterator, std::, ) SYMBOL(common_reference, std::, ) SYMBOL(common_reference_t, std::, ) SYMBOL(common_reference_with, std::, ) 
SYMBOL(common_type, std::, ) SYMBOL(common_type_t, std::, ) SYMBOL(common_with, std::, ) SYMBOL(comp_ellint_1, std::, ) SYMBOL(comp_ellint_1f, std::, ) SYMBOL(comp_ellint_1l, std::, ) SYMBOL(comp_ellint_2, std::, ) SYMBOL(comp_ellint_2f, std::, ) SYMBOL(comp_ellint_2l, std::, ) SYMBOL(comp_ellint_3, std::, ) SYMBOL(comp_ellint_3f, std::, ) SYMBOL(comp_ellint_3l, std::, ) SYMBOL(compare_partial_order_fallback, std::, ) SYMBOL(compare_strong_order_fallback, std::, ) SYMBOL(compare_three_way_result, std::, ) SYMBOL(compare_three_way_result_t, std::, ) SYMBOL(compare_weak_order_fallback, std::, ) SYMBOL(complex, std::, ) SYMBOL(condition_variable, std::, ) SYMBOL(condition_variable_any, std::, ) SYMBOL(conditional, std::, ) SYMBOL(conditional_t, std::, ) SYMBOL(conj, std::, ) SYMBOL(conjunction, std::, ) SYMBOL(conjunction_v, std::, ) SYMBOL(const_mem_fun1_ref_t, std::, ) SYMBOL(const_mem_fun1_t, std::, ) SYMBOL(const_mem_fun_ref_t, std::, ) SYMBOL(const_mem_fun_t, std::, ) SYMBOL(const_pointer_cast, std::, ) SYMBOL(construct_at, std::, ) SYMBOL(constructible_from, std::, ) SYMBOL(contiguous_iterator, std::, ) SYMBOL(contiguous_iterator_tag, std::, ) SYMBOL(convertible_to, std::, ) SYMBOL(copy, std::, ) SYMBOL(copy_backward, std::, ) SYMBOL(copy_constructible, std::, ) SYMBOL(copy_if, std::, ) SYMBOL(copy_n, std::, ) SYMBOL(copyable, std::, ) SYMBOL(copysign, std::, ) SYMBOL(copysign, None, ) SYMBOL(copysign, None, ) SYMBOL(copysignf, std::, ) SYMBOL(copysignf, None, ) SYMBOL(copysignf, None, ) SYMBOL(copysignl, std::, ) SYMBOL(copysignl, None, ) SYMBOL(copysignl, None, ) SYMBOL(coroutine_handle, std::, ) SYMBOL(coroutine_traits, std::, ) SYMBOL(cos, std::, ) SYMBOL(cos, None, ) SYMBOL(cos, None, ) SYMBOL(cosf, std::, ) SYMBOL(cosf, None, ) SYMBOL(cosf, None, ) SYMBOL(cosh, std::, ) SYMBOL(cosh, None, ) SYMBOL(cosh, None, ) SYMBOL(coshf, std::, ) SYMBOL(coshf, None, ) SYMBOL(coshf, None, ) SYMBOL(coshl, std::, ) SYMBOL(coshl, None, ) SYMBOL(coshl, None, ) SYMBOL(cosl, std::, ) SYMBOL(cosl, None, ) SYMBOL(cosl, None, ) SYMBOL(count, std::, ) SYMBOL(count_if, std::, ) SYMBOL(counted_iterator, std::, ) SYMBOL(counting_semaphore, std::, ) SYMBOL(countl_one, std::, ) SYMBOL(countl_zero, std::, ) SYMBOL(countr_one, std::, ) SYMBOL(countr_zero, std::, ) SYMBOL(cout, std::, ) SYMBOL(cref, std::, ) SYMBOL(cregex_iterator, std::, ) SYMBOL(cregex_token_iterator, std::, ) SYMBOL(csub_match, std::, ) SYMBOL(ctime, std::, ) SYMBOL(ctime, None, ) SYMBOL(ctime, None, ) SYMBOL(ctype, std::, ) SYMBOL(ctype_base, std::, ) SYMBOL(ctype_byname, std::, ) SYMBOL(current_exception, std::, ) SYMBOL(cv_status, std::, ) SYMBOL(cyl_bessel_i, std::, ) SYMBOL(cyl_bessel_if, std::, ) SYMBOL(cyl_bessel_il, std::, ) SYMBOL(cyl_bessel_j, std::, ) SYMBOL(cyl_bessel_jf, std::, ) SYMBOL(cyl_bessel_jl, std::, ) SYMBOL(cyl_bessel_k, std::, ) SYMBOL(cyl_bessel_kf, std::, ) SYMBOL(cyl_bessel_kl, std::, ) SYMBOL(cyl_neumann, std::, ) SYMBOL(cyl_neumannf, std::, ) SYMBOL(cyl_neumannl, std::, ) SYMBOL(dec, std::, ) SYMBOL(dec, std::, ) SYMBOL(deca, std::, ) SYMBOL(decay, std::, ) SYMBOL(decay_t, std::, ) SYMBOL(deci, std::, ) SYMBOL(declare_no_pointers, std::, ) SYMBOL(declare_reachable, std::, ) SYMBOL(declval, std::, ) SYMBOL(default_delete, std::, ) SYMBOL(default_initializable, std::, ) SYMBOL(default_random_engine, std::, ) SYMBOL(default_searcher, std::, ) SYMBOL(default_sentinel, std::, ) SYMBOL(default_sentinel_t, std::, ) SYMBOL(defaultfloat, std::, ) SYMBOL(defaultfloat, std::, ) SYMBOL(defer_lock, std::, ) 
SYMBOL(defer_lock_t, std::, ) SYMBOL(denorm_absent, std::, ) SYMBOL(denorm_indeterminate, std::, ) SYMBOL(denorm_present, std::, ) SYMBOL(deque, std::, ) SYMBOL(derived_from, std::, ) SYMBOL(destroy, std::, ) SYMBOL(destroy_at, std::, ) SYMBOL(destroy_n, std::, ) SYMBOL(destroying_delete, std::, ) SYMBOL(destroying_delete_t, std::, ) SYMBOL(destructible, std::, ) SYMBOL(difftime, std::, ) SYMBOL(difftime, None, ) SYMBOL(difftime, None, ) SYMBOL(disable_sized_sentinel_for, std::, ) SYMBOL(discard_block_engine, std::, ) SYMBOL(discrete_distribution, std::, ) SYMBOL(disjunction, std::, ) SYMBOL(disjunction_v, std::, ) SYMBOL(distance, std::, ) SYMBOL(div_t, std::, ) SYMBOL(div_t, None, ) SYMBOL(div_t, None, ) SYMBOL(divides, std::, ) SYMBOL(domain_error, std::, ) SYMBOL(double_t, std::, ) SYMBOL(double_t, None, ) SYMBOL(double_t, None, ) SYMBOL(dynamic_extent, std::, ) SYMBOL(dynamic_pointer_cast, std::, ) SYMBOL(ellint_1, std::, ) SYMBOL(ellint_1f, std::, ) SYMBOL(ellint_1l, std::, ) SYMBOL(ellint_2, std::, ) SYMBOL(ellint_2f, std::, ) SYMBOL(ellint_2l, std::, ) SYMBOL(ellint_3, std::, ) SYMBOL(ellint_3f, std::, ) SYMBOL(ellint_3l, std::, ) SYMBOL(emit_on_flush, std::, ) SYMBOL(emit_on_flush, std::, ) SYMBOL(enable_if, std::, ) SYMBOL(enable_if_t, std::, ) SYMBOL(enable_shared_from_this, std::, ) SYMBOL(endian, std::, ) SYMBOL(endl, std::, ) SYMBOL(endl, std::, ) SYMBOL(ends, std::, ) SYMBOL(ends, std::, ) SYMBOL(equal, std::, ) SYMBOL(equal_range, std::, ) SYMBOL(equal_to, std::, ) SYMBOL(equality_comparable, std::, ) SYMBOL(equality_comparable_with, std::, ) SYMBOL(equivalence_relation, std::, ) SYMBOL(erase, std::, ) SYMBOL(erase_if, std::, ) SYMBOL(erf, std::, ) SYMBOL(erf, None, ) SYMBOL(erf, None, ) SYMBOL(erfc, std::, ) SYMBOL(erfc, None, ) SYMBOL(erfc, None, ) SYMBOL(erfcf, std::, ) SYMBOL(erfcf, None, ) SYMBOL(erfcf, None, ) SYMBOL(erfcl, std::, ) SYMBOL(erfcl, None, ) SYMBOL(erfcl, None, ) SYMBOL(erff, std::, ) SYMBOL(erff, None, ) SYMBOL(erff, None, ) SYMBOL(erfl, std::, ) SYMBOL(erfl, None, ) SYMBOL(erfl, None, ) SYMBOL(errc, std::, ) SYMBOL(error_category, std::, ) SYMBOL(error_code, std::, ) SYMBOL(error_condition, std::, ) SYMBOL(exa, std::, ) SYMBOL(exception, std::, ) SYMBOL(exception_ptr, std::, ) SYMBOL(exchange, std::, ) SYMBOL(exclusive_scan, std::, ) SYMBOL(exit, std::, ) SYMBOL(exit, None, ) SYMBOL(exit, None, ) SYMBOL(exp, std::, ) SYMBOL(exp, None, ) SYMBOL(exp, None, ) SYMBOL(exp2, std::, ) SYMBOL(exp2, None, ) SYMBOL(exp2, None, ) SYMBOL(exp2f, std::, ) SYMBOL(exp2f, None, ) SYMBOL(exp2f, None, ) SYMBOL(exp2l, std::, ) SYMBOL(exp2l, None, ) SYMBOL(exp2l, None, ) SYMBOL(expf, std::, ) SYMBOL(expf, None, ) SYMBOL(expf, None, ) SYMBOL(expint, std::, ) SYMBOL(expintf, std::, ) SYMBOL(expintl, std::, ) SYMBOL(expl, std::, ) SYMBOL(expl, None, ) SYMBOL(expl, None, ) SYMBOL(expm1, std::, ) SYMBOL(expm1, None, ) SYMBOL(expm1, None, ) SYMBOL(expm1f, std::, ) SYMBOL(expm1f, None, ) SYMBOL(expm1f, None, ) SYMBOL(expm1l, std::, ) SYMBOL(expm1l, None, ) SYMBOL(expm1l, None, ) SYMBOL(exponential_distribution, std::, ) SYMBOL(extent, std::, ) SYMBOL(extent_v, std::, ) SYMBOL(extreme_value_distribution, std::, ) SYMBOL(fabs, std::, ) SYMBOL(fabs, None, ) SYMBOL(fabs, None, ) SYMBOL(fabsf, std::, ) SYMBOL(fabsf, None, ) SYMBOL(fabsf, None, ) SYMBOL(fabsl, std::, ) SYMBOL(fabsl, None, ) SYMBOL(fabsl, None, ) SYMBOL(false_type, std::, ) SYMBOL(fclose, std::, ) SYMBOL(fclose, None, ) SYMBOL(fclose, None, ) SYMBOL(fdim, std::, ) SYMBOL(fdim, None, ) SYMBOL(fdim, None, ) SYMBOL(fdimf, 
std::, ) SYMBOL(fdimf, None, ) SYMBOL(fdimf, None, ) SYMBOL(fdiml, std::, ) SYMBOL(fdiml, None, ) SYMBOL(fdiml, None, ) SYMBOL(feclearexcept, std::, ) SYMBOL(feclearexcept, None, ) SYMBOL(feclearexcept, None, ) SYMBOL(fegetenv, std::, ) SYMBOL(fegetenv, None, ) SYMBOL(fegetenv, None, ) SYMBOL(fegetexceptflag, std::, ) SYMBOL(fegetexceptflag, None, ) SYMBOL(fegetexceptflag, None, ) SYMBOL(fegetround, std::, ) SYMBOL(fegetround, None, ) SYMBOL(fegetround, None, ) SYMBOL(feholdexcept, std::, ) SYMBOL(feholdexcept, None, ) SYMBOL(feholdexcept, None, ) SYMBOL(femto, std::, ) SYMBOL(fenv_t, std::, ) SYMBOL(fenv_t, None, ) SYMBOL(fenv_t, None, ) SYMBOL(feof, std::, ) SYMBOL(feof, None, ) SYMBOL(feof, None, ) SYMBOL(feraiseexcept, std::, ) SYMBOL(feraiseexcept, None, ) SYMBOL(feraiseexcept, None, ) SYMBOL(ferror, std::, ) SYMBOL(ferror, None, ) SYMBOL(ferror, None, ) SYMBOL(fesetenv, std::, ) SYMBOL(fesetenv, None, ) SYMBOL(fesetenv, None, ) SYMBOL(fesetexceptflag, std::, ) SYMBOL(fesetexceptflag, None, ) SYMBOL(fesetexceptflag, None, ) SYMBOL(fesetround, std::, ) SYMBOL(fesetround, None, ) SYMBOL(fesetround, None, ) SYMBOL(fetestexcept, std::, ) SYMBOL(fetestexcept, None, ) SYMBOL(fetestexcept, None, ) SYMBOL(feupdateenv, std::, ) SYMBOL(feupdateenv, None, ) SYMBOL(feupdateenv, None, ) SYMBOL(fexcept_t, std::, ) SYMBOL(fexcept_t, None, ) SYMBOL(fexcept_t, None, ) SYMBOL(fflush, std::, ) SYMBOL(fflush, None, ) SYMBOL(fflush, None, ) SYMBOL(fgetc, std::, ) SYMBOL(fgetc, None, ) SYMBOL(fgetc, None, ) SYMBOL(fgetpos, std::, ) SYMBOL(fgetpos, None, ) SYMBOL(fgetpos, None, ) SYMBOL(fgets, std::, ) SYMBOL(fgets, None, ) SYMBOL(fgets, None, ) SYMBOL(fgetwc, std::, ) SYMBOL(fgetwc, None, ) SYMBOL(fgetwc, None, ) SYMBOL(fgetws, std::, ) SYMBOL(fgetws, None, ) SYMBOL(fgetws, None, ) SYMBOL(filebuf, std::, ) SYMBOL(filebuf, std::, ) SYMBOL(filebuf, std::, ) SYMBOL(fill, std::, ) SYMBOL(fill_n, std::, ) SYMBOL(find, std::, ) SYMBOL(find_end, std::, ) SYMBOL(find_first_of, std::, ) SYMBOL(find_if, std::, ) SYMBOL(find_if_not, std::, ) SYMBOL(fisher_f_distribution, std::, ) SYMBOL(fixed, std::, ) SYMBOL(fixed, std::, ) SYMBOL(float_denorm_style, std::, ) SYMBOL(float_round_style, std::, ) SYMBOL(float_t, std::, ) SYMBOL(float_t, None, ) SYMBOL(float_t, None, ) SYMBOL(floating_point, std::, ) SYMBOL(floor, std::, ) SYMBOL(floor, None, ) SYMBOL(floor, None, ) SYMBOL(floorf, std::, ) SYMBOL(floorf, None, ) SYMBOL(floorf, None, ) SYMBOL(floorl, std::, ) SYMBOL(floorl, None, ) SYMBOL(floorl, None, ) SYMBOL(flush, std::, ) SYMBOL(flush, std::, ) SYMBOL(flush_emit, std::, ) SYMBOL(flush_emit, std::, ) SYMBOL(fma, std::, ) SYMBOL(fma, None, ) SYMBOL(fma, None, ) SYMBOL(fmaf, std::, ) SYMBOL(fmaf, None, ) SYMBOL(fmaf, None, ) SYMBOL(fmal, std::, ) SYMBOL(fmal, None, ) SYMBOL(fmal, None, ) SYMBOL(fmax, std::, ) SYMBOL(fmax, None, ) SYMBOL(fmax, None, ) SYMBOL(fmaxf, std::, ) SYMBOL(fmaxf, None, ) SYMBOL(fmaxf, None, ) SYMBOL(fmaxl, std::, ) SYMBOL(fmaxl, None, ) SYMBOL(fmaxl, None, ) SYMBOL(fmin, std::, ) SYMBOL(fmin, None, ) SYMBOL(fmin, None, ) SYMBOL(fminf, std::, ) SYMBOL(fminf, None, ) SYMBOL(fminf, None, ) SYMBOL(fminl, std::, ) SYMBOL(fminl, None, ) SYMBOL(fminl, None, ) SYMBOL(fmod, std::, ) SYMBOL(fmod, None, ) SYMBOL(fmod, None, ) SYMBOL(fmodf, std::, ) SYMBOL(fmodf, None, ) SYMBOL(fmodf, None, ) SYMBOL(fmodl, std::, ) SYMBOL(fmodl, None, ) SYMBOL(fmodl, None, ) SYMBOL(fopen, std::, ) SYMBOL(fopen, None, ) SYMBOL(fopen, None, ) SYMBOL(for_each, std::, ) SYMBOL(for_each_n, std::, ) SYMBOL(format, std::, ) 
SYMBOL(format_args, std::, ) SYMBOL(format_context, std::, ) SYMBOL(format_error, std::, ) SYMBOL(format_parse_context, std::, ) SYMBOL(format_to, std::, ) SYMBOL(format_to_n, std::, ) SYMBOL(format_to_n_result, std::, ) SYMBOL(formatted_size, std::, ) SYMBOL(formatter, std::, ) SYMBOL(forward, std::, ) SYMBOL(forward_as_tuple, std::, ) SYMBOL(forward_iterator, std::, ) SYMBOL(forward_iterator_tag, std::, ) SYMBOL(forward_like, std::, ) SYMBOL(forward_list, std::, ) SYMBOL(fpclassify, std::, ) SYMBOL(fpclassify, None, ) SYMBOL(fpclassify, None, ) SYMBOL(fpos, std::, ) SYMBOL(fpos, std::, ) SYMBOL(fpos, std::, ) SYMBOL(fpos_t, std::, ) SYMBOL(fpos_t, None, ) SYMBOL(fpos_t, None, ) SYMBOL(fprintf, std::, ) SYMBOL(fprintf, None, ) SYMBOL(fprintf, None, ) SYMBOL(fputc, std::, ) SYMBOL(fputc, None, ) SYMBOL(fputc, None, ) SYMBOL(fputs, std::, ) SYMBOL(fputs, None, ) SYMBOL(fputs, None, ) SYMBOL(fputwc, std::, ) SYMBOL(fputwc, None, ) SYMBOL(fputwc, None, ) SYMBOL(fputws, std::, ) SYMBOL(fputws, None, ) SYMBOL(fputws, None, ) SYMBOL(fread, std::, ) SYMBOL(fread, None, ) SYMBOL(fread, None, ) SYMBOL(free, std::, ) SYMBOL(free, None, ) SYMBOL(free, None, ) SYMBOL(freopen, std::, ) SYMBOL(freopen, None, ) SYMBOL(freopen, None, ) SYMBOL(frexp, std::, ) SYMBOL(frexp, None, ) SYMBOL(frexp, None, ) SYMBOL(frexpf, std::, ) SYMBOL(frexpf, None, ) SYMBOL(frexpf, None, ) SYMBOL(frexpl, std::, ) SYMBOL(frexpl, None, ) SYMBOL(frexpl, None, ) SYMBOL(from_chars, std::, ) SYMBOL(from_chars_result, std::, ) SYMBOL(from_range, std::, ) SYMBOL(from_range_t, std::, ) SYMBOL(front_insert_iterator, std::, ) SYMBOL(front_inserter, std::, ) SYMBOL(fscanf, std::, ) SYMBOL(fscanf, None, ) SYMBOL(fscanf, None, ) SYMBOL(fseek, std::, ) SYMBOL(fseek, None, ) SYMBOL(fseek, None, ) SYMBOL(fsetpos, std::, ) SYMBOL(fsetpos, None, ) SYMBOL(fsetpos, None, ) SYMBOL(fstream, std::, ) SYMBOL(fstream, std::, ) SYMBOL(ftell, std::, ) SYMBOL(ftell, None, ) SYMBOL(ftell, None, ) SYMBOL(function, std::, ) SYMBOL(future, std::, ) SYMBOL(future_category, std::, ) SYMBOL(future_errc, std::, ) SYMBOL(future_error, std::, ) SYMBOL(future_status, std::, ) SYMBOL(fwide, std::, ) SYMBOL(fwide, None, ) SYMBOL(fwide, None, ) SYMBOL(fwprintf, std::, ) SYMBOL(fwprintf, None, ) SYMBOL(fwprintf, None, ) SYMBOL(fwrite, std::, ) SYMBOL(fwrite, None, ) SYMBOL(fwrite, None, ) SYMBOL(fwscanf, std::, ) SYMBOL(fwscanf, None, ) SYMBOL(fwscanf, None, ) SYMBOL(gamma_distribution, std::, ) SYMBOL(gcd, std::, ) SYMBOL(generate, std::, ) SYMBOL(generate_canonical, std::, ) SYMBOL(generate_n, std::, ) SYMBOL(generic_category, std::, ) SYMBOL(geometric_distribution, std::, ) SYMBOL(get_deleter, std::, ) SYMBOL(get_if, std::, ) SYMBOL(get_money, std::, ) SYMBOL(get_new_handler, std::, ) SYMBOL(get_pointer_safety, std::, ) SYMBOL(get_temporary_buffer, std::, ) SYMBOL(get_terminate, std::, ) SYMBOL(get_time, std::, ) SYMBOL(get_unexpected, std::, ) SYMBOL(getc, std::, ) SYMBOL(getc, None, ) SYMBOL(getc, None, ) SYMBOL(getchar, std::, ) SYMBOL(getchar, None, ) SYMBOL(getchar, None, ) SYMBOL(getenv, std::, ) SYMBOL(getenv, None, ) SYMBOL(getenv, None, ) SYMBOL(getline, std::, ) SYMBOL(gets, std::, ) SYMBOL(gets, None, ) SYMBOL(gets, None, ) SYMBOL(getwc, std::, ) SYMBOL(getwc, None, ) SYMBOL(getwc, None, ) SYMBOL(getwchar, std::, ) SYMBOL(getwchar, None, ) SYMBOL(getwchar, None, ) SYMBOL(giga, std::, ) SYMBOL(gmtime, std::, ) SYMBOL(gmtime, None, ) SYMBOL(gmtime, None, ) SYMBOL(greater, std::, ) SYMBOL(greater_equal, std::, ) SYMBOL(gslice, std::, ) SYMBOL(gslice_array, 
std::, ) SYMBOL(hardware_constructive_interference_size, std::, ) SYMBOL(hardware_destructive_interference_size, std::, ) SYMBOL(has_facet, std::, ) SYMBOL(has_single_bit, std::, ) SYMBOL(has_unique_object_representations, std::, ) SYMBOL(has_unique_object_representations_v, std::, ) SYMBOL(has_virtual_destructor, std::, ) SYMBOL(has_virtual_destructor_v, std::, ) SYMBOL(hash, std::, ) SYMBOL(hecto, std::, ) SYMBOL(hermite, std::, ) SYMBOL(hermitef, std::, ) SYMBOL(hermitel, std::, ) SYMBOL(hex, std::, ) SYMBOL(hex, std::, ) SYMBOL(hexfloat, std::, ) SYMBOL(hexfloat, std::, ) SYMBOL(holds_alternative, std::, ) SYMBOL(hypot, std::, ) SYMBOL(hypot, None, ) SYMBOL(hypot, None, ) SYMBOL(hypotf, std::, ) SYMBOL(hypotf, None, ) SYMBOL(hypotf, None, ) SYMBOL(hypotl, std::, ) SYMBOL(hypotl, None, ) SYMBOL(hypotl, None, ) SYMBOL(identity, std::, ) SYMBOL(ifstream, std::, ) SYMBOL(ifstream, std::, ) SYMBOL(ignore, std::, ) SYMBOL(ilogb, std::, ) SYMBOL(ilogb, None, ) SYMBOL(ilogb, None, ) SYMBOL(ilogbf, std::, ) SYMBOL(ilogbf, None, ) SYMBOL(ilogbf, None, ) SYMBOL(ilogbl, std::, ) SYMBOL(ilogbl, None, ) SYMBOL(ilogbl, None, ) SYMBOL(imag, std::, ) SYMBOL(imaxabs, std::, ) SYMBOL(imaxabs, None, ) SYMBOL(imaxabs, None, ) SYMBOL(imaxdiv, std::, ) SYMBOL(imaxdiv, None, ) SYMBOL(imaxdiv, None, ) SYMBOL(imaxdiv_t, std::, ) SYMBOL(imaxdiv_t, None, ) SYMBOL(imaxdiv_t, None, ) SYMBOL(in_place, std::, ) SYMBOL(in_place_index, std::, ) SYMBOL(in_place_index_t, std::, ) SYMBOL(in_place_t, std::, ) SYMBOL(in_place_type, std::, ) SYMBOL(in_place_type_t, std::, ) SYMBOL(in_range, std::, ) SYMBOL(includes, std::, ) SYMBOL(inclusive_scan, std::, ) SYMBOL(incrementable, std::, ) SYMBOL(incrementable_traits, std::, ) SYMBOL(independent_bits_engine, std::, ) SYMBOL(indirect_array, std::, ) SYMBOL(indirect_binary_predicate, std::, ) SYMBOL(indirect_equivalence_relation, std::, ) SYMBOL(indirect_result_t, std::, ) SYMBOL(indirect_strict_weak_order, std::, ) SYMBOL(indirect_unary_predicate, std::, ) SYMBOL(indirectly_comparable, std::, ) SYMBOL(indirectly_copyable, std::, ) SYMBOL(indirectly_copyable_storable, std::, ) SYMBOL(indirectly_movable, std::, ) SYMBOL(indirectly_movable_storable, std::, ) SYMBOL(indirectly_readable, std::, ) SYMBOL(indirectly_readable_traits, std::, ) SYMBOL(indirectly_regular_unary_invocable, std::, ) SYMBOL(indirectly_swappable, std::, ) SYMBOL(indirectly_unary_invocable, std::, ) SYMBOL(indirectly_writable, std::, ) SYMBOL(initializer_list, std::, ) SYMBOL(inner_product, std::, ) SYMBOL(inout_ptr, std::, ) SYMBOL(inout_ptr_t, std::, ) SYMBOL(inplace_merge, std::, ) SYMBOL(input_iterator, std::, ) SYMBOL(input_iterator_tag, std::, ) SYMBOL(input_or_output_iterator, std::, ) SYMBOL(insert_iterator, std::, ) SYMBOL(inserter, std::, ) SYMBOL(int16_t, std::, ) SYMBOL(int16_t, None, ) SYMBOL(int16_t, None, ) SYMBOL(int32_t, std::, ) SYMBOL(int32_t, None, ) SYMBOL(int32_t, None, ) SYMBOL(int64_t, std::, ) SYMBOL(int64_t, None, ) SYMBOL(int64_t, None, ) SYMBOL(int8_t, std::, ) SYMBOL(int8_t, None, ) SYMBOL(int8_t, None, ) SYMBOL(int_fast16_t, std::, ) SYMBOL(int_fast16_t, None, ) SYMBOL(int_fast16_t, None, ) SYMBOL(int_fast32_t, std::, ) SYMBOL(int_fast32_t, None, ) SYMBOL(int_fast32_t, None, ) SYMBOL(int_fast64_t, std::, ) SYMBOL(int_fast64_t, None, ) SYMBOL(int_fast64_t, None, ) SYMBOL(int_fast8_t, std::, ) SYMBOL(int_fast8_t, None, ) SYMBOL(int_fast8_t, None, ) SYMBOL(int_least16_t, std::, ) SYMBOL(int_least16_t, None, ) SYMBOL(int_least16_t, None, ) SYMBOL(int_least32_t, std::, ) 
SYMBOL(int_least32_t, None, ) SYMBOL(int_least32_t, None, ) SYMBOL(int_least64_t, std::, ) SYMBOL(int_least64_t, None, ) SYMBOL(int_least64_t, None, ) SYMBOL(int_least8_t, std::, ) SYMBOL(int_least8_t, None, ) SYMBOL(int_least8_t, None, ) SYMBOL(integer_sequence, std::, ) SYMBOL(integral, std::, ) SYMBOL(integral_constant, std::, ) SYMBOL(internal, std::, ) SYMBOL(internal, std::, ) SYMBOL(intmax_t, std::, ) SYMBOL(intmax_t, None, ) SYMBOL(intmax_t, None, ) SYMBOL(intptr_t, std::, ) SYMBOL(intptr_t, None, ) SYMBOL(intptr_t, None, ) SYMBOL(invalid_argument, std::, ) SYMBOL(invocable, std::, ) SYMBOL(invoke, std::, ) SYMBOL(invoke_r, std::, ) SYMBOL(invoke_result, std::, ) SYMBOL(invoke_result_t, std::, ) SYMBOL(io_errc, std::, ) SYMBOL(io_errc, std::, ) SYMBOL(io_state, std::, ) SYMBOL(io_state, std::, ) SYMBOL(ios, std::, ) SYMBOL(ios, std::, ) SYMBOL(ios, std::, ) SYMBOL(ios_base, std::, ) SYMBOL(ios_base, std::, ) SYMBOL(iostream, std::, ) SYMBOL(iostream, std::, ) SYMBOL(iostream, std::, ) SYMBOL(iostream_category, std::, ) SYMBOL(iostream_category, std::, ) SYMBOL(iota, std::, ) SYMBOL(is_abstract, std::, ) SYMBOL(is_abstract_v, std::, ) SYMBOL(is_aggregate, std::, ) SYMBOL(is_aggregate_v, std::, ) SYMBOL(is_arithmetic, std::, ) SYMBOL(is_arithmetic_v, std::, ) SYMBOL(is_array, std::, ) SYMBOL(is_array_v, std::, ) SYMBOL(is_assignable, std::, ) SYMBOL(is_assignable_v, std::, ) SYMBOL(is_base_of, std::, ) SYMBOL(is_base_of_v, std::, ) SYMBOL(is_bind_expression, std::, ) SYMBOL(is_bind_expression_v, std::, ) SYMBOL(is_bounded_array, std::, ) SYMBOL(is_bounded_array_v, std::, ) SYMBOL(is_class, std::, ) SYMBOL(is_class_v, std::, ) SYMBOL(is_compound, std::, ) SYMBOL(is_compound_v, std::, ) SYMBOL(is_const, std::, ) SYMBOL(is_const_v, std::, ) SYMBOL(is_constant_evaluated, std::, ) SYMBOL(is_constructible, std::, ) SYMBOL(is_constructible_v, std::, ) SYMBOL(is_convertible, std::, ) SYMBOL(is_convertible_v, std::, ) SYMBOL(is_copy_assignable, std::, ) SYMBOL(is_copy_assignable_v, std::, ) SYMBOL(is_copy_constructible, std::, ) SYMBOL(is_copy_constructible_v, std::, ) SYMBOL(is_corresponding_member, std::, ) SYMBOL(is_default_constructible, std::, ) SYMBOL(is_default_constructible_v, std::, ) SYMBOL(is_destructible, std::, ) SYMBOL(is_destructible_v, std::, ) SYMBOL(is_empty, std::, ) SYMBOL(is_empty_v, std::, ) SYMBOL(is_enum, std::, ) SYMBOL(is_enum_v, std::, ) SYMBOL(is_eq, std::, ) SYMBOL(is_error_code_enum, std::, ) SYMBOL(is_error_condition_enum, std::, ) SYMBOL(is_error_condition_enum_v, std::, ) SYMBOL(is_execution_policy, std::, ) SYMBOL(is_execution_policy_v, std::, ) SYMBOL(is_final, std::, ) SYMBOL(is_final_v, std::, ) SYMBOL(is_floating_point, std::, ) SYMBOL(is_floating_point_v, std::, ) SYMBOL(is_function, std::, ) SYMBOL(is_function_v, std::, ) SYMBOL(is_fundamental, std::, ) SYMBOL(is_fundamental_v, std::, ) SYMBOL(is_gt, std::, ) SYMBOL(is_gteq, std::, ) SYMBOL(is_heap, std::, ) SYMBOL(is_heap_until, std::, ) SYMBOL(is_integral, std::, ) SYMBOL(is_integral_v, std::, ) SYMBOL(is_invocable, std::, ) SYMBOL(is_invocable_r, std::, ) SYMBOL(is_invocable_r_v, std::, ) SYMBOL(is_invocable_v, std::, ) SYMBOL(is_layout_compatible, std::, ) SYMBOL(is_layout_compatible_v, std::, ) SYMBOL(is_literal_type, std::, ) SYMBOL(is_literal_type_v, std::, ) SYMBOL(is_lt, std::, ) SYMBOL(is_lteq, std::, ) SYMBOL(is_lvalue_reference, std::, ) SYMBOL(is_lvalue_reference_v, std::, ) SYMBOL(is_member_function_pointer, std::, ) SYMBOL(is_member_function_pointer_v, std::, ) 
SYMBOL(is_member_object_pointer, std::, ) SYMBOL(is_member_object_pointer_v, std::, ) SYMBOL(is_member_pointer, std::, ) SYMBOL(is_member_pointer_v, std::, ) SYMBOL(is_move_assignable, std::, ) SYMBOL(is_move_assignable_v, std::, ) SYMBOL(is_move_constructible, std::, ) SYMBOL(is_move_constructible_v, std::, ) SYMBOL(is_neq, std::, ) SYMBOL(is_nothrow_assignable, std::, ) SYMBOL(is_nothrow_assignable_v, std::, ) SYMBOL(is_nothrow_constructible, std::, ) SYMBOL(is_nothrow_constructible_v, std::, ) SYMBOL(is_nothrow_convertible, std::, ) SYMBOL(is_nothrow_convertible_v, std::, ) SYMBOL(is_nothrow_copy_assignable, std::, ) SYMBOL(is_nothrow_copy_assignable_v, std::, ) SYMBOL(is_nothrow_copy_constructible, std::, ) SYMBOL(is_nothrow_copy_constructible_v, std::, ) SYMBOL(is_nothrow_default_constructible, std::, ) SYMBOL(is_nothrow_default_constructible_v, std::, ) SYMBOL(is_nothrow_destructible, std::, ) SYMBOL(is_nothrow_destructible_v, std::, ) SYMBOL(is_nothrow_invocable, std::, ) SYMBOL(is_nothrow_invocable_r, std::, ) SYMBOL(is_nothrow_invocable_r_v, std::, ) SYMBOL(is_nothrow_invocable_v, std::, ) SYMBOL(is_nothrow_move_assignable, std::, ) SYMBOL(is_nothrow_move_assignable_v, std::, ) SYMBOL(is_nothrow_move_constructible, std::, ) SYMBOL(is_nothrow_move_constructible_v, std::, ) SYMBOL(is_nothrow_swappable, std::, ) SYMBOL(is_nothrow_swappable_v, std::, ) SYMBOL(is_nothrow_swappable_with, std::, ) SYMBOL(is_nothrow_swappable_with_v, std::, ) SYMBOL(is_null_pointer, std::, ) SYMBOL(is_null_pointer_v, std::, ) SYMBOL(is_object, std::, ) SYMBOL(is_object_v, std::, ) SYMBOL(is_partitioned, std::, ) SYMBOL(is_permutation, std::, ) SYMBOL(is_placeholder, std::, ) SYMBOL(is_placeholder_v, std::, ) SYMBOL(is_pod, std::, ) SYMBOL(is_pod_v, std::, ) SYMBOL(is_pointer, std::, ) SYMBOL(is_pointer_interconvertible_base_of, std::, ) SYMBOL(is_pointer_interconvertible_base_of_v, std::, ) SYMBOL(is_pointer_interconvertible_with_class, std::, ) SYMBOL(is_pointer_v, std::, ) SYMBOL(is_polymorphic, std::, ) SYMBOL(is_polymorphic_v, std::, ) SYMBOL(is_reference, std::, ) SYMBOL(is_reference_v, std::, ) SYMBOL(is_rvalue_reference, std::, ) SYMBOL(is_rvalue_reference_v, std::, ) SYMBOL(is_same, std::, ) SYMBOL(is_same_v, std::, ) SYMBOL(is_scalar, std::, ) SYMBOL(is_scalar_v, std::, ) SYMBOL(is_scoped_enum, std::, ) SYMBOL(is_scoped_enum_v, std::, ) SYMBOL(is_signed, std::, ) SYMBOL(is_signed_v, std::, ) SYMBOL(is_sorted, std::, ) SYMBOL(is_sorted_until, std::, ) SYMBOL(is_standard_layout, std::, ) SYMBOL(is_standard_layout_v, std::, ) SYMBOL(is_swappable, std::, ) SYMBOL(is_swappable_v, std::, ) SYMBOL(is_swappable_with, std::, ) SYMBOL(is_swappable_with_v, std::, ) SYMBOL(is_trivial, std::, ) SYMBOL(is_trivial_v, std::, ) SYMBOL(is_trivially_assignable, std::, ) SYMBOL(is_trivially_assignable_v, std::, ) SYMBOL(is_trivially_constructible, std::, ) SYMBOL(is_trivially_constructible_v, std::, ) SYMBOL(is_trivially_copy_assignable, std::, ) SYMBOL(is_trivially_copy_assignable_v, std::, ) SYMBOL(is_trivially_copy_constructible, std::, ) SYMBOL(is_trivially_copy_constructible_v, std::, ) SYMBOL(is_trivially_copyable, std::, ) SYMBOL(is_trivially_copyable_v, std::, ) SYMBOL(is_trivially_default_constructible, std::, ) SYMBOL(is_trivially_default_constructible_v, std::, ) SYMBOL(is_trivially_destructible, std::, ) SYMBOL(is_trivially_destructible_v, std::, ) SYMBOL(is_trivially_move_assignable, std::, ) SYMBOL(is_trivially_move_assignable_v, std::, ) SYMBOL(is_trivially_move_constructible, std::, ) 
SYMBOL(is_trivially_move_constructible_v, std::, ) SYMBOL(is_unbounded_array, std::, ) SYMBOL(is_unbounded_array_v, std::, ) SYMBOL(is_union, std::, ) SYMBOL(is_union_v, std::, ) SYMBOL(is_unsigned, std::, ) SYMBOL(is_unsigned_v, std::, ) SYMBOL(is_void, std::, ) SYMBOL(is_void_v, std::, ) SYMBOL(is_volatile, std::, ) SYMBOL(is_volatile_v, std::, ) SYMBOL(isalnum, std::, ) SYMBOL(isalnum, None, ) SYMBOL(isalnum, None, ) SYMBOL(isalpha, std::, ) SYMBOL(isalpha, None, ) SYMBOL(isalpha, None, ) SYMBOL(isblank, std::, ) SYMBOL(isblank, None, ) SYMBOL(isblank, None, ) SYMBOL(iscntrl, std::, ) SYMBOL(iscntrl, None, ) SYMBOL(iscntrl, None, ) SYMBOL(isdigit, std::, ) SYMBOL(isdigit, None, ) SYMBOL(isdigit, None, ) SYMBOL(isfinite, std::, ) SYMBOL(isfinite, None, ) SYMBOL(isfinite, None, ) SYMBOL(isgraph, std::, ) SYMBOL(isgraph, None, ) SYMBOL(isgraph, None, ) SYMBOL(isgreater, std::, ) SYMBOL(isgreater, None, ) SYMBOL(isgreater, None, ) SYMBOL(isgreaterequal, std::, ) SYMBOL(isgreaterequal, None, ) SYMBOL(isgreaterequal, None, ) SYMBOL(isinf, std::, ) SYMBOL(isinf, None, ) SYMBOL(isinf, None, ) SYMBOL(isless, std::, ) SYMBOL(isless, None, ) SYMBOL(isless, None, ) SYMBOL(islessequal, std::, ) SYMBOL(islessequal, None, ) SYMBOL(islessequal, None, ) SYMBOL(islessgreater, std::, ) SYMBOL(islessgreater, None, ) SYMBOL(islessgreater, None, ) SYMBOL(islower, std::, ) SYMBOL(islower, None, ) SYMBOL(islower, None, ) SYMBOL(isnan, std::, ) SYMBOL(isnan, None, ) SYMBOL(isnan, None, ) SYMBOL(isnormal, std::, ) SYMBOL(isnormal, None, ) SYMBOL(isnormal, None, ) SYMBOL(ispanstream, std::, ) SYMBOL(ispanstream, std::, ) SYMBOL(isprint, std::, ) SYMBOL(isprint, None, ) SYMBOL(isprint, None, ) SYMBOL(ispunct, std::, ) SYMBOL(ispunct, None, ) SYMBOL(ispunct, None, ) SYMBOL(isspace, std::, ) SYMBOL(isspace, None, ) SYMBOL(isspace, None, ) SYMBOL(istream, std::, ) SYMBOL(istream, std::, ) SYMBOL(istream, std::, ) SYMBOL(istream_iterator, std::, ) SYMBOL(istreambuf_iterator, std::, ) SYMBOL(istreambuf_iterator, std::, ) SYMBOL(istringstream, std::, ) SYMBOL(istringstream, std::, ) SYMBOL(istrstream, std::, ) SYMBOL(isunordered, std::, ) SYMBOL(isunordered, None, ) SYMBOL(isunordered, None, ) SYMBOL(isupper, std::, ) SYMBOL(isupper, None, ) SYMBOL(isupper, None, ) SYMBOL(iswalnum, std::, ) SYMBOL(iswalnum, None, ) SYMBOL(iswalnum, None, ) SYMBOL(iswalpha, std::, ) SYMBOL(iswalpha, None, ) SYMBOL(iswalpha, None, ) SYMBOL(iswblank, std::, ) SYMBOL(iswblank, None, ) SYMBOL(iswblank, None, ) SYMBOL(iswcntrl, std::, ) SYMBOL(iswcntrl, None, ) SYMBOL(iswcntrl, None, ) SYMBOL(iswctype, std::, ) SYMBOL(iswctype, None, ) SYMBOL(iswctype, None, ) SYMBOL(iswdigit, std::, ) SYMBOL(iswdigit, None, ) SYMBOL(iswdigit, None, ) SYMBOL(iswgraph, std::, ) SYMBOL(iswgraph, None, ) SYMBOL(iswgraph, None, ) SYMBOL(iswlower, std::, ) SYMBOL(iswlower, None, ) SYMBOL(iswlower, None, ) SYMBOL(iswprint, std::, ) SYMBOL(iswprint, None, ) SYMBOL(iswprint, None, ) SYMBOL(iswpunct, std::, ) SYMBOL(iswpunct, None, ) SYMBOL(iswpunct, None, ) SYMBOL(iswspace, std::, ) SYMBOL(iswspace, None, ) SYMBOL(iswspace, None, ) SYMBOL(iswupper, std::, ) SYMBOL(iswupper, None, ) SYMBOL(iswupper, None, ) SYMBOL(iswxdigit, std::, ) SYMBOL(iswxdigit, None, ) SYMBOL(iswxdigit, None, ) SYMBOL(isxdigit, std::, ) SYMBOL(isxdigit, None, ) SYMBOL(isxdigit, None, ) SYMBOL(iter_common_reference_t, std::, ) SYMBOL(iter_const_reference_t, std::, ) SYMBOL(iter_difference_t, std::, ) SYMBOL(iter_reference_t, std::, ) SYMBOL(iter_rvalue_reference_t, std::, ) SYMBOL(iter_swap, 
std::, ) SYMBOL(iter_value_t, std::, ) SYMBOL(iterator, std::, ) SYMBOL(iterator_traits, std::, ) SYMBOL(jmp_buf, std::, ) SYMBOL(jmp_buf, None, ) SYMBOL(jmp_buf, None, ) SYMBOL(jthread, std::, ) SYMBOL(kill_dependency, std::, ) SYMBOL(kilo, std::, ) SYMBOL(knuth_b, std::, ) SYMBOL(labs, std::, ) SYMBOL(labs, None, ) SYMBOL(labs, None, ) SYMBOL(laguerre, std::, ) SYMBOL(laguerref, std::, ) SYMBOL(laguerrel, std::, ) SYMBOL(latch, std::, ) SYMBOL(launch, std::, ) SYMBOL(launder, std::, ) SYMBOL(lcm, std::, ) SYMBOL(lconv, std::, ) SYMBOL(lconv, None, ) SYMBOL(lconv, None, ) SYMBOL(ldexp, std::, ) SYMBOL(ldexp, None, ) SYMBOL(ldexp, None, ) SYMBOL(ldexpf, std::, ) SYMBOL(ldexpf, None, ) SYMBOL(ldexpf, None, ) SYMBOL(ldexpl, std::, ) SYMBOL(ldexpl, None, ) SYMBOL(ldexpl, None, ) SYMBOL(ldiv, std::, ) SYMBOL(ldiv, None, ) SYMBOL(ldiv, None, ) SYMBOL(ldiv_t, std::, ) SYMBOL(ldiv_t, None, ) SYMBOL(ldiv_t, None, ) SYMBOL(left, std::, ) SYMBOL(left, std::, ) SYMBOL(legendre, std::, ) SYMBOL(legendref, std::, ) SYMBOL(legendrel, std::, ) SYMBOL(length_error, std::, ) SYMBOL(lerp, std::, ) SYMBOL(less, std::, ) SYMBOL(less_equal, std::, ) SYMBOL(lexicographical_compare, std::, ) SYMBOL(lexicographical_compare_three_way, std::, ) SYMBOL(lgamma, std::, ) SYMBOL(lgamma, None, ) SYMBOL(lgamma, None, ) SYMBOL(lgammaf, std::, ) SYMBOL(lgammaf, None, ) SYMBOL(lgammaf, None, ) SYMBOL(lgammal, std::, ) SYMBOL(lgammal, None, ) SYMBOL(lgammal, None, ) SYMBOL(linear_congruential_engine, std::, ) SYMBOL(list, std::, ) SYMBOL(llabs, std::, ) SYMBOL(llabs, None, ) SYMBOL(llabs, None, ) SYMBOL(lldiv, std::, ) SYMBOL(lldiv, None, ) SYMBOL(lldiv, None, ) SYMBOL(lldiv_t, std::, ) SYMBOL(lldiv_t, None, ) SYMBOL(lldiv_t, None, ) SYMBOL(llrint, std::, ) SYMBOL(llrint, None, ) SYMBOL(llrint, None, ) SYMBOL(llrintf, std::, ) SYMBOL(llrintf, None, ) SYMBOL(llrintf, None, ) SYMBOL(llrintl, std::, ) SYMBOL(llrintl, None, ) SYMBOL(llrintl, None, ) SYMBOL(llround, std::, ) SYMBOL(llround, None, ) SYMBOL(llround, None, ) SYMBOL(llroundf, std::, ) SYMBOL(llroundf, None, ) SYMBOL(llroundf, None, ) SYMBOL(llroundl, std::, ) SYMBOL(llroundl, None, ) SYMBOL(llroundl, None, ) SYMBOL(locale, std::, ) SYMBOL(localeconv, std::, ) SYMBOL(localeconv, None, ) SYMBOL(localeconv, None, ) SYMBOL(localtime, std::, ) SYMBOL(localtime, None, ) SYMBOL(localtime, None, ) SYMBOL(lock, std::, ) SYMBOL(lock_guard, std::, ) SYMBOL(log, std::, ) SYMBOL(log, None, ) SYMBOL(log, None, ) SYMBOL(log10, std::, ) SYMBOL(log10, None, ) SYMBOL(log10, None, ) SYMBOL(log10f, std::, ) SYMBOL(log10f, None, ) SYMBOL(log10f, None, ) SYMBOL(log10l, std::, ) SYMBOL(log10l, None, ) SYMBOL(log10l, None, ) SYMBOL(log1p, std::, ) SYMBOL(log1p, None, ) SYMBOL(log1p, None, ) SYMBOL(log1pf, std::, ) SYMBOL(log1pf, None, ) SYMBOL(log1pf, None, ) SYMBOL(log1pl, std::, ) SYMBOL(log1pl, None, ) SYMBOL(log1pl, None, ) SYMBOL(log2, std::, ) SYMBOL(log2, None, ) SYMBOL(log2, None, ) SYMBOL(log2f, std::, ) SYMBOL(log2f, None, ) SYMBOL(log2f, None, ) SYMBOL(log2l, std::, ) SYMBOL(log2l, None, ) SYMBOL(log2l, None, ) SYMBOL(logb, std::, ) SYMBOL(logb, None, ) SYMBOL(logb, None, ) SYMBOL(logbf, std::, ) SYMBOL(logbf, None, ) SYMBOL(logbf, None, ) SYMBOL(logbl, std::, ) SYMBOL(logbl, None, ) SYMBOL(logbl, None, ) SYMBOL(logf, std::, ) SYMBOL(logf, None, ) SYMBOL(logf, None, ) SYMBOL(logic_error, std::, ) SYMBOL(logical_and, std::, ) SYMBOL(logical_not, std::, ) SYMBOL(logical_or, std::, ) SYMBOL(logl, std::, ) SYMBOL(logl, None, ) SYMBOL(logl, None, ) SYMBOL(lognormal_distribution, 
std::, ) SYMBOL(longjmp, std::, ) SYMBOL(longjmp, None, ) SYMBOL(longjmp, None, ) SYMBOL(lower_bound, std::, ) SYMBOL(lrint, std::, ) SYMBOL(lrint, None, ) SYMBOL(lrint, None, ) SYMBOL(lrintf, std::, ) SYMBOL(lrintf, None, ) SYMBOL(lrintf, None, ) SYMBOL(lrintl, std::, ) SYMBOL(lrintl, None, ) SYMBOL(lrintl, None, ) SYMBOL(lround, std::, ) SYMBOL(lround, None, ) SYMBOL(lround, None, ) SYMBOL(lroundf, std::, ) SYMBOL(lroundf, None, ) SYMBOL(lroundf, None, ) SYMBOL(lroundl, std::, ) SYMBOL(lroundl, None, ) SYMBOL(lroundl, None, ) SYMBOL(make_exception_ptr, std::, ) SYMBOL(make_format_args, std::, ) SYMBOL(make_from_tuple, std::, ) SYMBOL(make_heap, std::, ) SYMBOL(make_move_iterator, std::, ) SYMBOL(make_obj_using_allocator, std::, ) SYMBOL(make_optional, std::, ) SYMBOL(make_pair, std::, ) SYMBOL(make_reverse_iterator, std::, ) SYMBOL(make_shared, std::, ) SYMBOL(make_shared_for_overwrite, std::, ) SYMBOL(make_signed, std::, ) SYMBOL(make_signed_t, std::, ) SYMBOL(make_tuple, std::, ) SYMBOL(make_unique, std::, ) SYMBOL(make_unique_for_overwrite, std::, ) SYMBOL(make_unsigned, std::, ) SYMBOL(make_unsigned_t, std::, ) SYMBOL(make_wformat_args, std::, ) SYMBOL(malloc, std::, ) SYMBOL(malloc, None, ) SYMBOL(malloc, None, ) SYMBOL(map, std::, ) SYMBOL(mask_array, std::, ) SYMBOL(match_results, std::, ) SYMBOL(max, std::, ) SYMBOL(max_align_t, std::, ) SYMBOL(max_align_t, None, ) SYMBOL(max_align_t, None, ) SYMBOL(max_element, std::, ) SYMBOL(mblen, std::, ) SYMBOL(mblen, None, ) SYMBOL(mblen, None, ) SYMBOL(mbrlen, std::, ) SYMBOL(mbrlen, None, ) SYMBOL(mbrlen, None, ) SYMBOL(mbrtoc16, std::, ) SYMBOL(mbrtoc16, None, ) SYMBOL(mbrtoc16, None, ) SYMBOL(mbrtoc32, std::, ) SYMBOL(mbrtoc32, None, ) SYMBOL(mbrtoc32, None, ) SYMBOL(mbrtoc8, std::, ) SYMBOL(mbrtoc8, None, ) SYMBOL(mbrtoc8, None, ) SYMBOL(mbrtowc, std::, ) SYMBOL(mbrtowc, None, ) SYMBOL(mbrtowc, None, ) SYMBOL(mbsinit, std::, ) SYMBOL(mbsinit, None, ) SYMBOL(mbsinit, None, ) SYMBOL(mbsrtowcs, std::, ) SYMBOL(mbsrtowcs, None, ) SYMBOL(mbsrtowcs, None, ) SYMBOL(mbstowcs, std::, ) SYMBOL(mbstowcs, None, ) SYMBOL(mbstowcs, None, ) SYMBOL(mbtowc, std::, ) SYMBOL(mbtowc, None, ) SYMBOL(mbtowc, None, ) SYMBOL(mega, std::, ) SYMBOL(mem_fn, std::, ) SYMBOL(mem_fun, std::, ) SYMBOL(mem_fun1_ref_t, std::, ) SYMBOL(mem_fun1_t, std::, ) SYMBOL(mem_fun_ref, std::, ) SYMBOL(mem_fun_ref_t, std::, ) SYMBOL(mem_fun_t, std::, ) SYMBOL(memchr, std::, ) SYMBOL(memchr, None, ) SYMBOL(memchr, None, ) SYMBOL(memcmp, std::, ) SYMBOL(memcmp, None, ) SYMBOL(memcmp, None, ) SYMBOL(memcpy, std::, ) SYMBOL(memcpy, None, ) SYMBOL(memcpy, None, ) SYMBOL(memmove, std::, ) SYMBOL(memmove, None, ) SYMBOL(memmove, None, ) SYMBOL(memory_order, std::, ) SYMBOL(memory_order_acq_rel, std::, ) SYMBOL(memory_order_acquire, std::, ) SYMBOL(memory_order_consume, std::, ) SYMBOL(memory_order_relaxed, std::, ) SYMBOL(memory_order_release, std::, ) SYMBOL(memory_order_seq_cst, std::, ) SYMBOL(memset, std::, ) SYMBOL(memset, None, ) SYMBOL(memset, None, ) SYMBOL(merge, std::, ) SYMBOL(mergeable, std::, ) SYMBOL(mersenne_twister_engine, std::, ) SYMBOL(messages, std::, ) SYMBOL(messages_base, std::, ) SYMBOL(messages_byname, std::, ) SYMBOL(micro, std::, ) SYMBOL(midpoint, std::, ) SYMBOL(milli, std::, ) SYMBOL(min, std::, ) SYMBOL(min_element, std::, ) SYMBOL(minmax, std::, ) SYMBOL(minmax_element, std::, ) SYMBOL(minstd_rand, std::, ) SYMBOL(minstd_rand0, std::, ) SYMBOL(minus, std::, ) SYMBOL(mismatch, std::, ) SYMBOL(mktime, std::, ) SYMBOL(mktime, None, ) SYMBOL(mktime, None, ) 
SYMBOL(modf, std::, ) SYMBOL(modf, None, ) SYMBOL(modf, None, ) SYMBOL(modff, std::, ) SYMBOL(modff, None, ) SYMBOL(modff, None, ) SYMBOL(modfl, std::, ) SYMBOL(modfl, None, ) SYMBOL(modfl, None, ) SYMBOL(modulus, std::, ) SYMBOL(money_base, std::, ) SYMBOL(money_get, std::, ) SYMBOL(money_put, std::, ) SYMBOL(moneypunct, std::, ) SYMBOL(moneypunct_byname, std::, ) SYMBOL(monostate, std::, ) SYMBOL(movable, std::, ) SYMBOL(move_backward, std::, ) SYMBOL(move_constructible, std::, ) SYMBOL(move_if_noexcept, std::, ) SYMBOL(move_iterator, std::, ) SYMBOL(move_only_function, std::, ) SYMBOL(move_sentinel, std::, ) SYMBOL(mt19937, std::, ) SYMBOL(mt19937_64, std::, ) SYMBOL(multimap, std::, ) SYMBOL(multiplies, std::, ) SYMBOL(multiset, std::, ) SYMBOL(mutex, std::, ) SYMBOL(nan, std::, ) SYMBOL(nan, None, ) SYMBOL(nan, None, ) SYMBOL(nanf, std::, ) SYMBOL(nanf, None, ) SYMBOL(nanf, None, ) SYMBOL(nanl, std::, ) SYMBOL(nanl, None, ) SYMBOL(nanl, None, ) SYMBOL(nano, std::, ) SYMBOL(nearbyint, std::, ) SYMBOL(nearbyint, None, ) SYMBOL(nearbyint, None, ) SYMBOL(nearbyintf, std::, ) SYMBOL(nearbyintf, None, ) SYMBOL(nearbyintf, None, ) SYMBOL(nearbyintl, std::, ) SYMBOL(nearbyintl, None, ) SYMBOL(nearbyintl, None, ) SYMBOL(negate, std::, ) SYMBOL(negation, std::, ) SYMBOL(negation_v, std::, ) SYMBOL(negative_binomial_distribution, std::, ) SYMBOL(nested_exception, std::, ) SYMBOL(new_handler, std::, ) SYMBOL(next, std::, ) SYMBOL(next_permutation, std::, ) SYMBOL(nextafter, std::, ) SYMBOL(nextafter, None, ) SYMBOL(nextafter, None, ) SYMBOL(nextafterf, std::, ) SYMBOL(nextafterf, None, ) SYMBOL(nextafterf, None, ) SYMBOL(nextafterl, std::, ) SYMBOL(nextafterl, None, ) SYMBOL(nextafterl, None, ) SYMBOL(nexttoward, std::, ) SYMBOL(nexttoward, None, ) SYMBOL(nexttoward, None, ) SYMBOL(nexttowardf, std::, ) SYMBOL(nexttowardf, None, ) SYMBOL(nexttowardf, None, ) SYMBOL(nexttowardl, std::, ) SYMBOL(nexttowardl, None, ) SYMBOL(nexttowardl, None, ) SYMBOL(noboolalpha, std::, ) SYMBOL(noboolalpha, std::, ) SYMBOL(noemit_on_flush, std::, ) SYMBOL(noemit_on_flush, std::, ) SYMBOL(none_of, std::, ) SYMBOL(noop_coroutine, std::, ) SYMBOL(noop_coroutine_handle, std::, ) SYMBOL(noop_coroutine_promise, std::, ) SYMBOL(norm, std::, ) SYMBOL(normal_distribution, std::, ) SYMBOL(noshowbase, std::, ) SYMBOL(noshowbase, std::, ) SYMBOL(noshowpoint, std::, ) SYMBOL(noshowpoint, std::, ) SYMBOL(noshowpos, std::, ) SYMBOL(noshowpos, std::, ) SYMBOL(noskipws, std::, ) SYMBOL(noskipws, std::, ) SYMBOL(nostopstate, std::, ) SYMBOL(nostopstate_t, std::, ) SYMBOL(not1, std::, ) SYMBOL(not2, std::, ) SYMBOL(not_equal_to, std::, ) SYMBOL(not_fn, std::, ) SYMBOL(nothrow, std::, ) SYMBOL(nothrow_t, std::, ) SYMBOL(notify_all_at_thread_exit, std::, ) SYMBOL(nounitbuf, std::, ) SYMBOL(nounitbuf, std::, ) SYMBOL(nouppercase, std::, ) SYMBOL(nouppercase, std::, ) SYMBOL(nth_element, std::, ) SYMBOL(nullopt, std::, ) SYMBOL(nullopt_t, std::, ) SYMBOL(nullptr_t, std::, ) SYMBOL(nullptr_t, None, ) SYMBOL(nullptr_t, None, ) SYMBOL(num_get, std::, ) SYMBOL(num_put, std::, ) SYMBOL(numeric_limits, std::, ) SYMBOL(numpunct, std::, ) SYMBOL(numpunct_byname, std::, ) SYMBOL(oct, std::, ) SYMBOL(oct, std::, ) SYMBOL(ofstream, std::, ) SYMBOL(ofstream, std::, ) SYMBOL(once_flag, std::, ) SYMBOL(open_mode, std::, ) SYMBOL(open_mode, std::, ) SYMBOL(optional, std::, ) SYMBOL(ospanstream, std::, ) SYMBOL(ospanstream, std::, ) SYMBOL(ostream, std::, ) SYMBOL(ostream, std::, ) SYMBOL(ostream, std::, ) SYMBOL(ostream_iterator, std::, ) 
SYMBOL(ostreambuf_iterator, std::, ) SYMBOL(ostreambuf_iterator, std::, ) SYMBOL(ostringstream, std::, ) SYMBOL(ostringstream, std::, ) SYMBOL(ostrstream, std::, ) SYMBOL(osyncstream, std::, ) SYMBOL(osyncstream, std::, ) SYMBOL(out_of_range, std::, ) SYMBOL(out_ptr, std::, ) SYMBOL(out_ptr_t, std::, ) SYMBOL(output_iterator, std::, ) SYMBOL(output_iterator_tag, std::, ) SYMBOL(overflow_error, std::, ) SYMBOL(owner_less, std::, ) SYMBOL(packaged_task, std::, ) SYMBOL(pair, std::, ) SYMBOL(partial_order, std::, ) SYMBOL(partial_ordering, std::, ) SYMBOL(partial_sort, std::, ) SYMBOL(partial_sort_copy, std::, ) SYMBOL(partial_sum, std::, ) SYMBOL(partition, std::, ) SYMBOL(partition_copy, std::, ) SYMBOL(partition_point, std::, ) SYMBOL(permutable, std::, ) SYMBOL(perror, std::, ) SYMBOL(perror, None, ) SYMBOL(perror, None, ) SYMBOL(peta, std::, ) SYMBOL(pico, std::, ) SYMBOL(piecewise_constant_distribution, std::, ) SYMBOL(piecewise_construct, std::, ) SYMBOL(piecewise_construct_t, std::, ) SYMBOL(piecewise_linear_distribution, std::, ) SYMBOL(plus, std::, ) SYMBOL(pointer_safety, std::, ) SYMBOL(pointer_traits, std::, ) SYMBOL(poisson_distribution, std::, ) SYMBOL(polar, std::, ) SYMBOL(pop_heap, std::, ) SYMBOL(popcount, std::, ) SYMBOL(pow, std::, ) SYMBOL(pow, None, ) SYMBOL(pow, None, ) SYMBOL(powf, std::, ) SYMBOL(powf, None, ) SYMBOL(powf, None, ) SYMBOL(powl, std::, ) SYMBOL(powl, None, ) SYMBOL(powl, None, ) SYMBOL(predicate, std::, ) SYMBOL(preferred, std::, ) SYMBOL(prev, std::, ) SYMBOL(prev_permutation, std::, ) SYMBOL(printf, std::, ) SYMBOL(printf, None, ) SYMBOL(printf, None, ) SYMBOL(priority_queue, std::, ) SYMBOL(proj, std::, ) SYMBOL(projected, std::, ) SYMBOL(promise, std::, ) SYMBOL(ptr_fun, std::, ) SYMBOL(ptrdiff_t, std::, ) SYMBOL(ptrdiff_t, None, ) SYMBOL(ptrdiff_t, None, ) SYMBOL(push_heap, std::, ) SYMBOL(put_money, std::, ) SYMBOL(put_time, std::, ) SYMBOL(putc, std::, ) SYMBOL(putc, None, ) SYMBOL(putc, None, ) SYMBOL(putchar, std::, ) SYMBOL(putchar, None, ) SYMBOL(putchar, None, ) SYMBOL(puts, std::, ) SYMBOL(puts, None, ) SYMBOL(puts, None, ) SYMBOL(putwc, std::, ) SYMBOL(putwc, None, ) SYMBOL(putwc, None, ) SYMBOL(putwchar, std::, ) SYMBOL(putwchar, None, ) SYMBOL(putwchar, None, ) SYMBOL(qsort, std::, ) SYMBOL(qsort, None, ) SYMBOL(qsort, None, ) SYMBOL(queue, std::, ) SYMBOL(quick_exit, std::, ) SYMBOL(quick_exit, None, ) SYMBOL(quick_exit, None, ) SYMBOL(quoted, std::, ) SYMBOL(raise, std::, ) SYMBOL(raise, None, ) SYMBOL(raise, None, ) SYMBOL(rand, std::, ) SYMBOL(rand, None, ) SYMBOL(rand, None, ) SYMBOL(random_access_iterator, std::, ) SYMBOL(random_access_iterator_tag, std::, ) SYMBOL(random_device, std::, ) SYMBOL(random_shuffle, std::, ) SYMBOL(range_error, std::, ) SYMBOL(rank, std::, ) SYMBOL(rank_v, std::, ) SYMBOL(ranlux24, std::, ) SYMBOL(ranlux24_base, std::, ) SYMBOL(ranlux48, std::, ) SYMBOL(ranlux48_base, std::, ) SYMBOL(ratio, std::, ) SYMBOL(ratio_add, std::, ) SYMBOL(ratio_divide, std::, ) SYMBOL(ratio_equal, std::, ) SYMBOL(ratio_equal_v, std::, ) SYMBOL(ratio_greater, std::, ) SYMBOL(ratio_greater_equal, std::, ) SYMBOL(ratio_greater_equal_v, std::, ) SYMBOL(ratio_greater_v, std::, ) SYMBOL(ratio_less, std::, ) SYMBOL(ratio_less_equal, std::, ) SYMBOL(ratio_less_equal_v, std::, ) SYMBOL(ratio_less_v, std::, ) SYMBOL(ratio_multiply, std::, ) SYMBOL(ratio_not_equal, std::, ) SYMBOL(ratio_not_equal_v, std::, ) SYMBOL(ratio_subtract, std::, ) SYMBOL(raw_storage_iterator, std::, ) SYMBOL(real, std::, ) SYMBOL(realloc, std::, ) 
SYMBOL(realloc, None, ) SYMBOL(realloc, None, ) SYMBOL(recursive_mutex, std::, ) SYMBOL(recursive_timed_mutex, std::, ) SYMBOL(reduce, std::, ) SYMBOL(ref, std::, ) SYMBOL(reference_wrapper, std::, ) SYMBOL(regex, std::, ) SYMBOL(regex_error, std::, ) SYMBOL(regex_iterator, std::, ) SYMBOL(regex_match, std::, ) SYMBOL(regex_replace, std::, ) SYMBOL(regex_search, std::, ) SYMBOL(regex_token_iterator, std::, ) SYMBOL(regex_traits, std::, ) SYMBOL(regular, std::, ) SYMBOL(regular_invocable, std::, ) SYMBOL(reinterpret_pointer_cast, std::, ) SYMBOL(relation, std::, ) SYMBOL(remainder, std::, ) SYMBOL(remainder, None, ) SYMBOL(remainder, None, ) SYMBOL(remainderf, std::, ) SYMBOL(remainderf, None, ) SYMBOL(remainderf, None, ) SYMBOL(remainderl, std::, ) SYMBOL(remainderl, None, ) SYMBOL(remainderl, None, ) SYMBOL(remove_all_extents, std::, ) SYMBOL(remove_all_extents_t, std::, ) SYMBOL(remove_const, std::, ) SYMBOL(remove_const_t, std::, ) SYMBOL(remove_copy, std::, ) SYMBOL(remove_copy_if, std::, ) SYMBOL(remove_cv, std::, ) SYMBOL(remove_cv_t, std::, ) SYMBOL(remove_cvref, std::, ) SYMBOL(remove_cvref_t, std::, ) SYMBOL(remove_extent, std::, ) SYMBOL(remove_extent_t, std::, ) SYMBOL(remove_if, std::, ) SYMBOL(remove_pointer, std::, ) SYMBOL(remove_pointer_t, std::, ) SYMBOL(remove_reference, std::, ) SYMBOL(remove_reference_t, std::, ) SYMBOL(remove_volatile, std::, ) SYMBOL(remove_volatile_t, std::, ) SYMBOL(remquo, std::, ) SYMBOL(remquo, None, ) SYMBOL(remquo, None, ) SYMBOL(remquof, std::, ) SYMBOL(remquof, None, ) SYMBOL(remquof, None, ) SYMBOL(remquol, std::, ) SYMBOL(remquol, None, ) SYMBOL(remquol, None, ) SYMBOL(rename, std::, ) SYMBOL(rename, None, ) SYMBOL(rename, None, ) SYMBOL(replace, std::, ) SYMBOL(replace_copy, std::, ) SYMBOL(replace_copy_if, std::, ) SYMBOL(replace_if, std::, ) SYMBOL(resetiosflags, std::, ) SYMBOL(result_of, std::, ) SYMBOL(result_of_t, std::, ) SYMBOL(rethrow_exception, std::, ) SYMBOL(rethrow_if_nested, std::, ) SYMBOL(return_temporary_buffer, std::, ) SYMBOL(reverse, std::, ) SYMBOL(reverse_copy, std::, ) SYMBOL(reverse_iterator, std::, ) SYMBOL(rewind, std::, ) SYMBOL(rewind, None, ) SYMBOL(rewind, None, ) SYMBOL(riemann_zeta, std::, ) SYMBOL(riemann_zetaf, std::, ) SYMBOL(riemann_zetal, std::, ) SYMBOL(right, std::, ) SYMBOL(right, std::, ) SYMBOL(rint, std::, ) SYMBOL(rint, None, ) SYMBOL(rint, None, ) SYMBOL(rintf, std::, ) SYMBOL(rintf, None, ) SYMBOL(rintf, None, ) SYMBOL(rintl, std::, ) SYMBOL(rintl, None, ) SYMBOL(rintl, None, ) SYMBOL(rotate, std::, ) SYMBOL(rotate_copy, std::, ) SYMBOL(rotl, std::, ) SYMBOL(rotr, std::, ) SYMBOL(round, std::, ) SYMBOL(round, None, ) SYMBOL(round, None, ) SYMBOL(round_indeterminate, std::, ) SYMBOL(round_to_nearest, std::, ) SYMBOL(round_toward_infinity, std::, ) SYMBOL(round_toward_neg_infinity, std::, ) SYMBOL(round_toward_zero, std::, ) SYMBOL(roundf, std::, ) SYMBOL(roundf, None, ) SYMBOL(roundf, None, ) SYMBOL(roundl, std::, ) SYMBOL(roundl, None, ) SYMBOL(roundl, None, ) SYMBOL(runtime_error, std::, ) SYMBOL(same_as, std::, ) SYMBOL(sample, std::, ) SYMBOL(scalbln, std::, ) SYMBOL(scalbln, None, ) SYMBOL(scalbln, None, ) SYMBOL(scalblnf, std::, ) SYMBOL(scalblnf, None, ) SYMBOL(scalblnf, None, ) SYMBOL(scalblnl, std::, ) SYMBOL(scalblnl, None, ) SYMBOL(scalblnl, None, ) SYMBOL(scalbn, std::, ) SYMBOL(scalbn, None, ) SYMBOL(scalbn, None, ) SYMBOL(scalbnf, std::, ) SYMBOL(scalbnf, None, ) SYMBOL(scalbnf, None, ) SYMBOL(scalbnl, std::, ) SYMBOL(scalbnl, None, ) SYMBOL(scalbnl, None, ) SYMBOL(scanf, std::, ) 
SYMBOL(scanf, None, ) SYMBOL(scanf, None, ) SYMBOL(scientific, std::, ) SYMBOL(scientific, std::, ) SYMBOL(scoped_allocator_adaptor, std::, ) SYMBOL(scoped_lock, std::, ) SYMBOL(search, std::, ) SYMBOL(search_n, std::, ) SYMBOL(seed_seq, std::, ) SYMBOL(seek_dir, std::, ) SYMBOL(seek_dir, std::, ) SYMBOL(semiregular, std::, ) SYMBOL(sentinel_for, std::, ) SYMBOL(set, std::, ) SYMBOL(set_difference, std::, ) SYMBOL(set_intersection, std::, ) SYMBOL(set_new_handler, std::, ) SYMBOL(set_symmetric_difference, std::, ) SYMBOL(set_terminate, std::, ) SYMBOL(set_unexpected, std::, ) SYMBOL(set_union, std::, ) SYMBOL(setbase, std::, ) SYMBOL(setbuf, std::, ) SYMBOL(setbuf, None, ) SYMBOL(setbuf, None, ) SYMBOL(setfill, std::, ) SYMBOL(setiosflags, std::, ) SYMBOL(setlocale, std::, ) SYMBOL(setlocale, None, ) SYMBOL(setlocale, None, ) SYMBOL(setprecision, std::, ) SYMBOL(setvbuf, std::, ) SYMBOL(setvbuf, None, ) SYMBOL(setvbuf, None, ) SYMBOL(setw, std::, ) SYMBOL(shared_future, std::, ) SYMBOL(shared_lock, std::, ) SYMBOL(shared_mutex, std::, ) SYMBOL(shared_ptr, std::, ) SYMBOL(shared_timed_mutex, std::, ) SYMBOL(shift_left, std::, ) SYMBOL(shift_right, std::, ) SYMBOL(showbase, std::, ) SYMBOL(showbase, std::, ) SYMBOL(showpoint, std::, ) SYMBOL(showpoint, std::, ) SYMBOL(showpos, std::, ) SYMBOL(showpos, std::, ) SYMBOL(shuffle, std::, ) SYMBOL(shuffle_order_engine, std::, ) SYMBOL(sig_atomic_t, std::, ) SYMBOL(sig_atomic_t, None, ) SYMBOL(sig_atomic_t, None, ) SYMBOL(signal, std::, ) SYMBOL(signal, None, ) SYMBOL(signal, None, ) SYMBOL(signbit, std::, ) SYMBOL(signbit, None, ) SYMBOL(signbit, None, ) SYMBOL(signed_integral, std::, ) SYMBOL(sin, std::, ) SYMBOL(sin, None, ) SYMBOL(sin, None, ) SYMBOL(sinf, std::, ) SYMBOL(sinf, None, ) SYMBOL(sinf, None, ) SYMBOL(sinh, std::, ) SYMBOL(sinh, None, ) SYMBOL(sinh, None, ) SYMBOL(sinhf, std::, ) SYMBOL(sinhf, None, ) SYMBOL(sinhf, None, ) SYMBOL(sinhl, std::, ) SYMBOL(sinhl, None, ) SYMBOL(sinhl, None, ) SYMBOL(sinl, std::, ) SYMBOL(sinl, None, ) SYMBOL(sinl, None, ) SYMBOL(sized_sentinel_for, std::, ) SYMBOL(skipws, std::, ) SYMBOL(skipws, std::, ) SYMBOL(slice, std::, ) SYMBOL(slice_array, std::, ) SYMBOL(smatch, std::, ) SYMBOL(snprintf, std::, ) SYMBOL(snprintf, None, ) SYMBOL(snprintf, None, ) SYMBOL(sort, std::, ) SYMBOL(sort_heap, std::, ) SYMBOL(sortable, std::, ) SYMBOL(source_location, std::, ) SYMBOL(span, std::, ) SYMBOL(spanbuf, std::, ) SYMBOL(spanbuf, std::, ) SYMBOL(spanstream, std::, ) SYMBOL(spanstream, std::, ) SYMBOL(sph_bessel, std::, ) SYMBOL(sph_bessel, None, ) SYMBOL(sph_bessel, None, ) SYMBOL(sph_besself, std::, ) SYMBOL(sph_besself, None, ) SYMBOL(sph_besself, None, ) SYMBOL(sph_bessell, std::, ) SYMBOL(sph_bessell, None, ) SYMBOL(sph_bessell, None, ) SYMBOL(sph_legendre, std::, ) SYMBOL(sph_legendref, std::, ) SYMBOL(sph_legendrel, std::, ) SYMBOL(sph_neumann, std::, ) SYMBOL(sph_neumannf, std::, ) SYMBOL(sph_neumannl, std::, ) SYMBOL(sprintf, std::, ) SYMBOL(sprintf, None, ) SYMBOL(sprintf, None, ) SYMBOL(sqrt, std::, ) SYMBOL(sqrt, None, ) SYMBOL(sqrt, None, ) SYMBOL(sqrtf, std::, ) SYMBOL(sqrtf, None, ) SYMBOL(sqrtf, None, ) SYMBOL(sqrtl, std::, ) SYMBOL(sqrtl, None, ) SYMBOL(sqrtl, None, ) SYMBOL(srand, std::, ) SYMBOL(srand, None, ) SYMBOL(srand, None, ) SYMBOL(sregex_iterator, std::, ) SYMBOL(sregex_token_iterator, std::, ) SYMBOL(sscanf, std::, ) SYMBOL(sscanf, None, ) SYMBOL(sscanf, None, ) SYMBOL(ssub_match, std::, ) SYMBOL(stable_partition, std::, ) SYMBOL(stable_sort, std::, ) SYMBOL(stack, std::, ) 
SYMBOL(stacktrace, std::, ) SYMBOL(stacktrace_entry, std::, ) SYMBOL(static_pointer_cast, std::, ) SYMBOL(stod, std::, ) SYMBOL(stof, std::, ) SYMBOL(stoi, std::, ) SYMBOL(stol, std::, ) SYMBOL(stold, std::, ) SYMBOL(stoll, std::, ) SYMBOL(stop_callback, std::, ) SYMBOL(stop_source, std::, ) SYMBOL(stop_token, std::, ) SYMBOL(stoul, std::, ) SYMBOL(stoull, std::, ) SYMBOL(strcat, std::, ) SYMBOL(strcat, None, ) SYMBOL(strcat, None, ) SYMBOL(strchr, std::, ) SYMBOL(strchr, None, ) SYMBOL(strchr, None, ) SYMBOL(strcmp, std::, ) SYMBOL(strcmp, None, ) SYMBOL(strcmp, None, ) SYMBOL(strcoll, std::, ) SYMBOL(strcoll, None, ) SYMBOL(strcoll, None, ) SYMBOL(strcpy, std::, ) SYMBOL(strcpy, None, ) SYMBOL(strcpy, None, ) SYMBOL(strcspn, std::, ) SYMBOL(strcspn, None, ) SYMBOL(strcspn, None, ) SYMBOL(streambuf, std::, ) SYMBOL(streambuf, std::, ) SYMBOL(streambuf, std::, ) SYMBOL(streamoff, std::, ) SYMBOL(streamoff, std::, ) SYMBOL(streampos, std::, ) SYMBOL(streampos, std::, ) SYMBOL(streamsize, std::, ) SYMBOL(streamsize, std::, ) SYMBOL(strerror, std::, ) SYMBOL(strerror, None, ) SYMBOL(strerror, None, ) SYMBOL(strftime, std::, ) SYMBOL(strftime, None, ) SYMBOL(strftime, None, ) SYMBOL(strict, std::, ) SYMBOL(strict_weak_order, std::, ) SYMBOL(string, std::, ) SYMBOL(string_view, std::, ) SYMBOL(stringbuf, std::, ) SYMBOL(stringbuf, std::, ) SYMBOL(stringstream, std::, ) SYMBOL(stringstream, std::, ) SYMBOL(strlen, std::, ) SYMBOL(strlen, None, ) SYMBOL(strlen, None, ) SYMBOL(strncat, std::, ) SYMBOL(strncat, None, ) SYMBOL(strncat, None, ) SYMBOL(strncmp, std::, ) SYMBOL(strncmp, None, ) SYMBOL(strncmp, None, ) SYMBOL(strncpy, std::, ) SYMBOL(strncpy, None, ) SYMBOL(strncpy, None, ) SYMBOL(strong_order, std::, ) SYMBOL(strong_ordering, std::, ) SYMBOL(strpbrk, std::, ) SYMBOL(strpbrk, None, ) SYMBOL(strpbrk, None, ) SYMBOL(strrchr, std::, ) SYMBOL(strrchr, None, ) SYMBOL(strrchr, None, ) SYMBOL(strspn, std::, ) SYMBOL(strspn, None, ) SYMBOL(strspn, None, ) SYMBOL(strstr, std::, ) SYMBOL(strstr, None, ) SYMBOL(strstr, None, ) SYMBOL(strstream, std::, ) SYMBOL(strstreambuf, std::, ) SYMBOL(strtod, std::, ) SYMBOL(strtod, None, ) SYMBOL(strtod, None, ) SYMBOL(strtof, std::, ) SYMBOL(strtof, None, ) SYMBOL(strtof, None, ) SYMBOL(strtoimax, std::, ) SYMBOL(strtoimax, None, ) SYMBOL(strtoimax, None, ) SYMBOL(strtok, std::, ) SYMBOL(strtok, None, ) SYMBOL(strtok, None, ) SYMBOL(strtol, std::, ) SYMBOL(strtol, None, ) SYMBOL(strtol, None, ) SYMBOL(strtold, std::, ) SYMBOL(strtold, None, ) SYMBOL(strtold, None, ) SYMBOL(strtoll, std::, ) SYMBOL(strtoll, None, ) SYMBOL(strtoll, None, ) SYMBOL(strtoul, std::, ) SYMBOL(strtoul, None, ) SYMBOL(strtoul, None, ) SYMBOL(strtoull, std::, ) SYMBOL(strtoull, None, ) SYMBOL(strtoull, None, ) SYMBOL(strtoumax, std::, ) SYMBOL(strtoumax, None, ) SYMBOL(strtoumax, None, ) SYMBOL(strxfrm, std::, ) SYMBOL(strxfrm, None, ) SYMBOL(strxfrm, None, ) SYMBOL(student_t_distribution, std::, ) SYMBOL(sub_match, std::, ) SYMBOL(subtract_with_carry_engine, std::, ) SYMBOL(suspend_always, std::, ) SYMBOL(suspend_never, std::, ) SYMBOL(swap_ranges, std::, ) SYMBOL(swappable, std::, ) SYMBOL(swappable_with, std::, ) SYMBOL(swprintf, std::, ) SYMBOL(swprintf, None, ) SYMBOL(swprintf, None, ) SYMBOL(swscanf, std::, ) SYMBOL(swscanf, None, ) SYMBOL(swscanf, None, ) SYMBOL(syncbuf, std::, ) SYMBOL(syncbuf, std::, ) SYMBOL(system, std::, ) SYMBOL(system, None, ) SYMBOL(system, None, ) SYMBOL(system_category, std::, ) SYMBOL(system_error, std::, ) SYMBOL(tan, std::, ) SYMBOL(tan, None, ) 
SYMBOL(tan, None, ) SYMBOL(tanf, std::, ) SYMBOL(tanf, None, ) SYMBOL(tanf, None, ) SYMBOL(tanh, std::, ) SYMBOL(tanh, None, ) SYMBOL(tanh, None, ) SYMBOL(tanhf, std::, ) SYMBOL(tanhf, None, ) SYMBOL(tanhf, None, ) SYMBOL(tanhl, std::, ) SYMBOL(tanhl, None, ) SYMBOL(tanhl, None, ) SYMBOL(tanl, std::, ) SYMBOL(tanl, None, ) SYMBOL(tanl, None, ) SYMBOL(tera, std::, ) SYMBOL(terminate, std::, ) SYMBOL(terminate_handler, std::, ) SYMBOL(tgamma, std::, ) SYMBOL(tgamma, None, ) SYMBOL(tgamma, None, ) SYMBOL(tgammaf, std::, ) SYMBOL(tgammaf, None, ) SYMBOL(tgammaf, None, ) SYMBOL(tgammal, std::, ) SYMBOL(tgammal, None, ) SYMBOL(tgammal, None, ) SYMBOL(thread, std::, ) SYMBOL(three_way_comparable, std::, ) SYMBOL(three_way_comparable_with, std::, ) SYMBOL(throw_with_nested, std::, ) SYMBOL(tie, std::, ) SYMBOL(time, std::, ) SYMBOL(time, None, ) SYMBOL(time, None, ) SYMBOL(time_base, std::, ) SYMBOL(time_get, std::, ) SYMBOL(time_get_byname, std::, ) SYMBOL(time_put, std::, ) SYMBOL(time_put_byname, std::, ) SYMBOL(time_t, std::, ) SYMBOL(time_t, None, ) SYMBOL(time_t, None, ) SYMBOL(timed_mutex, std::, ) SYMBOL(timespec, std::, ) SYMBOL(timespec, None, ) SYMBOL(timespec, None, ) SYMBOL(timespec_get, std::, ) SYMBOL(timespec_get, None, ) SYMBOL(timespec_get, None, ) SYMBOL(tm, std::, ) SYMBOL(tm, None, ) SYMBOL(tm, None, ) SYMBOL(tmpfile, std::, ) SYMBOL(tmpfile, None, ) SYMBOL(tmpfile, None, ) SYMBOL(tmpnam, std::, ) SYMBOL(tmpnam, None, ) SYMBOL(tmpnam, None, ) SYMBOL(to_address, std::, ) SYMBOL(to_array, std::, ) SYMBOL(to_chars, std::, ) SYMBOL(to_chars_result, std::, ) SYMBOL(to_integer, std::, ) SYMBOL(to_integer, None, ) SYMBOL(to_integer, None, ) SYMBOL(to_string, std::, ) SYMBOL(to_underlying, std::, ) SYMBOL(to_wstring, std::, ) SYMBOL(tolower, std::, ) SYMBOL(tolower, None, ) SYMBOL(tolower, None, ) SYMBOL(totally_ordered, std::, ) SYMBOL(totally_ordered_with, std::, ) SYMBOL(toupper, std::, ) SYMBOL(toupper, None, ) SYMBOL(toupper, None, ) SYMBOL(towctrans, std::, ) SYMBOL(towctrans, None, ) SYMBOL(towctrans, None, ) SYMBOL(towlower, std::, ) SYMBOL(towlower, None, ) SYMBOL(towlower, None, ) SYMBOL(towupper, std::, ) SYMBOL(towupper, None, ) SYMBOL(towupper, None, ) SYMBOL(transform, std::, ) SYMBOL(transform_exclusive_scan, std::, ) SYMBOL(transform_inclusive_scan, std::, ) SYMBOL(transform_reduce, std::, ) SYMBOL(true_type, std::, ) SYMBOL(trunc, std::, ) SYMBOL(trunc, None, ) SYMBOL(trunc, None, ) SYMBOL(truncf, std::, ) SYMBOL(truncf, None, ) SYMBOL(truncf, None, ) SYMBOL(truncl, std::, ) SYMBOL(truncl, None, ) SYMBOL(truncl, None, ) SYMBOL(try_lock, std::, ) SYMBOL(try_to_lock, std::, ) SYMBOL(try_to_lock_t, std::, ) SYMBOL(tuple, std::, ) SYMBOL(tuple_cat, std::, ) SYMBOL(tuple_element_t, std::, ) SYMBOL(tuple_size_v, std::, ) SYMBOL(type_identity, std::, ) SYMBOL(type_identity_t, std::, ) SYMBOL(type_index, std::, ) SYMBOL(type_info, std::, ) SYMBOL(u16streampos, std::, ) SYMBOL(u16streampos, std::, ) SYMBOL(u16string, std::, ) SYMBOL(u16string_view, std::, ) SYMBOL(u32streampos, std::, ) SYMBOL(u32streampos, std::, ) SYMBOL(u32string, std::, ) SYMBOL(u32string_view, std::, ) SYMBOL(u8streampos, std::, ) SYMBOL(u8streampos, std::, ) SYMBOL(u8string, std::, ) SYMBOL(u8string_view, std::, ) SYMBOL(uint16_t, std::, ) SYMBOL(uint16_t, None, ) SYMBOL(uint16_t, None, ) SYMBOL(uint32_t, std::, ) SYMBOL(uint32_t, None, ) SYMBOL(uint32_t, None, ) SYMBOL(uint64_t, std::, ) SYMBOL(uint64_t, None, ) SYMBOL(uint64_t, None, ) SYMBOL(uint8_t, std::, ) SYMBOL(uint8_t, None, ) SYMBOL(uint8_t, 
None, ) SYMBOL(uint_fast16_t, std::, ) SYMBOL(uint_fast16_t, None, ) SYMBOL(uint_fast16_t, None, ) SYMBOL(uint_fast32_t, std::, ) SYMBOL(uint_fast32_t, None, ) SYMBOL(uint_fast32_t, None, ) SYMBOL(uint_fast64_t, std::, ) SYMBOL(uint_fast64_t, None, ) SYMBOL(uint_fast64_t, None, ) SYMBOL(uint_fast8_t, std::, ) SYMBOL(uint_fast8_t, None, ) SYMBOL(uint_fast8_t, None, ) SYMBOL(uint_least16_t, std::, ) SYMBOL(uint_least16_t, None, ) SYMBOL(uint_least16_t, None, ) SYMBOL(uint_least32_t, std::, ) SYMBOL(uint_least32_t, None, ) SYMBOL(uint_least32_t, None, ) SYMBOL(uint_least64_t, std::, ) SYMBOL(uint_least64_t, None, ) SYMBOL(uint_least64_t, None, ) SYMBOL(uint_least8_t, std::, ) SYMBOL(uint_least8_t, None, ) SYMBOL(uint_least8_t, None, ) SYMBOL(uintmax_t, std::, ) SYMBOL(uintmax_t, None, ) SYMBOL(uintmax_t, None, ) SYMBOL(uintptr_t, std::, ) SYMBOL(uintptr_t, None, ) SYMBOL(uintptr_t, None, ) SYMBOL(unary_function, std::, ) SYMBOL(unary_negate, std::, ) SYMBOL(uncaught_exception, std::, ) SYMBOL(uncaught_exceptions, std::, ) SYMBOL(undeclare_no_pointers, std::, ) SYMBOL(undeclare_reachable, std::, ) SYMBOL(underflow_error, std::, ) SYMBOL(underlying_type, std::, ) SYMBOL(underlying_type_t, std::, ) SYMBOL(unexpected_handler, std::, ) SYMBOL(ungetc, std::, ) SYMBOL(ungetc, None, ) SYMBOL(ungetc, None, ) SYMBOL(ungetwc, std::, ) SYMBOL(ungetwc, None, ) SYMBOL(ungetwc, None, ) SYMBOL(uniform_int_distribution, std::, ) SYMBOL(uniform_random_bit_generator, std::, ) SYMBOL(uniform_real_distribution, std::, ) SYMBOL(uninitialized_construct_using_allocator, std::, ) SYMBOL(uninitialized_copy, std::, ) SYMBOL(uninitialized_copy_n, std::, ) SYMBOL(uninitialized_default_construct, std::, ) SYMBOL(uninitialized_default_construct_n, std::, ) SYMBOL(uninitialized_fill, std::, ) SYMBOL(uninitialized_fill_n, std::, ) SYMBOL(uninitialized_move, std::, ) SYMBOL(uninitialized_move_n, std::, ) SYMBOL(uninitialized_value_construct, std::, ) SYMBOL(uninitialized_value_construct_n, std::, ) SYMBOL(unique, std::, ) SYMBOL(unique_copy, std::, ) SYMBOL(unique_lock, std::, ) SYMBOL(unique_ptr, std::, ) SYMBOL(unitbuf, std::, ) SYMBOL(unitbuf, std::, ) SYMBOL(unordered_map, std::, ) SYMBOL(unordered_multimap, std::, ) SYMBOL(unordered_multiset, std::, ) SYMBOL(unordered_set, std::, ) SYMBOL(unreachable, std::, ) SYMBOL(unreachable_sentinel, std::, ) SYMBOL(unreachable_sentinel_t, std::, ) SYMBOL(unsigned_integral, std::, ) SYMBOL(upper_bound, std::, ) SYMBOL(uppercase, std::, ) SYMBOL(uppercase, std::, ) SYMBOL(use_facet, std::, ) SYMBOL(uses_allocator, std::, ) SYMBOL(uses_allocator_construction_args, std::, ) SYMBOL(uses_allocator_v, std::, ) SYMBOL(va_list, std::, ) SYMBOL(va_list, None, ) SYMBOL(va_list, None, ) SYMBOL(valarray, std::, ) SYMBOL(variant, std::, ) SYMBOL(variant_alternative, std::, ) SYMBOL(variant_alternative_t, std::, ) SYMBOL(variant_npos, std::, ) SYMBOL(variant_size, std::, ) SYMBOL(variant_size_v, std::, ) SYMBOL(vector, std::, ) SYMBOL(vformat, std::, ) SYMBOL(vformat_to, std::, ) SYMBOL(vfprintf, std::, ) SYMBOL(vfprintf, None, ) SYMBOL(vfprintf, None, ) SYMBOL(vfscanf, std::, ) SYMBOL(vfscanf, None, ) SYMBOL(vfscanf, None, ) SYMBOL(vfwprintf, std::, ) SYMBOL(vfwprintf, None, ) SYMBOL(vfwprintf, None, ) SYMBOL(vfwscanf, std::, ) SYMBOL(vfwscanf, None, ) SYMBOL(vfwscanf, None, ) SYMBOL(visit, std::, ) SYMBOL(visit_format_arg, std::, ) SYMBOL(void_t, std::, ) SYMBOL(vprintf, std::, ) SYMBOL(vprintf, None, ) SYMBOL(vprintf, None, ) SYMBOL(vscanf, std::, ) SYMBOL(vscanf, None, ) SYMBOL(vscanf, None, 
) SYMBOL(vsnprintf, std::, ) SYMBOL(vsnprintf, None, ) SYMBOL(vsnprintf, None, ) SYMBOL(vsprintf, std::, ) SYMBOL(vsprintf, None, ) SYMBOL(vsprintf, None, ) SYMBOL(vsscanf, std::, ) SYMBOL(vsscanf, None, ) SYMBOL(vsscanf, None, ) SYMBOL(vswprintf, std::, ) SYMBOL(vswprintf, None, ) SYMBOL(vswprintf, None, ) SYMBOL(vswscanf, std::, ) SYMBOL(vswscanf, None, ) SYMBOL(vswscanf, None, ) SYMBOL(vwprintf, std::, ) SYMBOL(vwprintf, None, ) SYMBOL(vwprintf, None, ) SYMBOL(vwscanf, std::, ) SYMBOL(vwscanf, None, ) SYMBOL(vwscanf, None, ) SYMBOL(wbuffer_convert, std::, ) SYMBOL(wcerr, std::, ) SYMBOL(wcin, std::, ) SYMBOL(wclog, std::, ) SYMBOL(wcmatch, std::, ) SYMBOL(wcout, std::, ) SYMBOL(wcregex_iterator, std::, ) SYMBOL(wcregex_token_iterator, std::, ) SYMBOL(wcrtomb, std::, ) SYMBOL(wcrtomb, None, ) SYMBOL(wcrtomb, None, ) SYMBOL(wcscat, std::, ) SYMBOL(wcscat, None, ) SYMBOL(wcscat, None, ) SYMBOL(wcschr, std::, ) SYMBOL(wcschr, None, ) SYMBOL(wcschr, None, ) SYMBOL(wcscmp, std::, ) SYMBOL(wcscmp, None, ) SYMBOL(wcscmp, None, ) SYMBOL(wcscoll, std::, ) SYMBOL(wcscoll, None, ) SYMBOL(wcscoll, None, ) SYMBOL(wcscpy, std::, ) SYMBOL(wcscpy, None, ) SYMBOL(wcscpy, None, ) SYMBOL(wcscspn, std::, ) SYMBOL(wcscspn, None, ) SYMBOL(wcscspn, None, ) SYMBOL(wcsftime, std::, ) SYMBOL(wcsftime, None, ) SYMBOL(wcsftime, None, ) SYMBOL(wcslen, std::, ) SYMBOL(wcslen, None, ) SYMBOL(wcslen, None, ) SYMBOL(wcsncat, std::, ) SYMBOL(wcsncat, None, ) SYMBOL(wcsncat, None, ) SYMBOL(wcsncmp, std::, ) SYMBOL(wcsncmp, None, ) SYMBOL(wcsncmp, None, ) SYMBOL(wcsncpy, std::, ) SYMBOL(wcsncpy, None, ) SYMBOL(wcsncpy, None, ) SYMBOL(wcspbrk, std::, ) SYMBOL(wcspbrk, None, ) SYMBOL(wcspbrk, None, ) SYMBOL(wcsrchr, std::, ) SYMBOL(wcsrchr, None, ) SYMBOL(wcsrchr, None, ) SYMBOL(wcsrtombs, std::, ) SYMBOL(wcsrtombs, None, ) SYMBOL(wcsrtombs, None, ) SYMBOL(wcsspn, std::, ) SYMBOL(wcsspn, None, ) SYMBOL(wcsspn, None, ) SYMBOL(wcsstr, std::, ) SYMBOL(wcsstr, None, ) SYMBOL(wcsstr, None, ) SYMBOL(wcstod, std::, ) SYMBOL(wcstod, None, ) SYMBOL(wcstod, None, ) SYMBOL(wcstof, std::, ) SYMBOL(wcstof, None, ) SYMBOL(wcstof, None, ) SYMBOL(wcstoimax, std::, ) SYMBOL(wcstoimax, None, ) SYMBOL(wcstoimax, None, ) SYMBOL(wcstok, std::, ) SYMBOL(wcstok, None, ) SYMBOL(wcstok, None, ) SYMBOL(wcstol, std::, ) SYMBOL(wcstol, None, ) SYMBOL(wcstol, None, ) SYMBOL(wcstold, std::, ) SYMBOL(wcstold, None, ) SYMBOL(wcstold, None, ) SYMBOL(wcstoll, std::, ) SYMBOL(wcstoll, None, ) SYMBOL(wcstoll, None, ) SYMBOL(wcstombs, std::, ) SYMBOL(wcstombs, None, ) SYMBOL(wcstombs, None, ) SYMBOL(wcstoul, std::, ) SYMBOL(wcstoul, None, ) SYMBOL(wcstoul, None, ) SYMBOL(wcstoull, std::, ) SYMBOL(wcstoull, None, ) SYMBOL(wcstoull, None, ) SYMBOL(wcstoumax, std::, ) SYMBOL(wcstoumax, None, ) SYMBOL(wcstoumax, None, ) SYMBOL(wcsub_match, std::, ) SYMBOL(wcsxfrm, std::, ) SYMBOL(wcsxfrm, None, ) SYMBOL(wcsxfrm, None, ) SYMBOL(wctob, std::, ) SYMBOL(wctob, None, ) SYMBOL(wctob, None, ) SYMBOL(wctomb, std::, ) SYMBOL(wctomb, None, ) SYMBOL(wctomb, None, ) SYMBOL(wctrans, std::, ) SYMBOL(wctrans, None, ) SYMBOL(wctrans, None, ) SYMBOL(wctrans_t, std::, ) SYMBOL(wctrans_t, None, ) SYMBOL(wctrans_t, None, ) SYMBOL(wctype, std::, ) SYMBOL(wctype, None, ) SYMBOL(wctype, None, ) SYMBOL(wctype_t, std::, ) SYMBOL(wctype_t, None, ) SYMBOL(wctype_t, None, ) SYMBOL(weak_order, std::, ) SYMBOL(weak_ordering, std::, ) SYMBOL(weak_ptr, std::, ) SYMBOL(weakly_incrementable, std::, ) SYMBOL(weibull_distribution, std::, ) SYMBOL(wfilebuf, std::, ) SYMBOL(wfilebuf, std::, ) 
SYMBOL(wfilebuf, std::, ) SYMBOL(wformat_args, std::, ) SYMBOL(wformat_context, std::, ) SYMBOL(wformat_parse_context, std::, ) SYMBOL(wfstream, std::, ) SYMBOL(wfstream, std::, ) SYMBOL(wifstream, std::, ) SYMBOL(wifstream, std::, ) SYMBOL(wios, std::, ) SYMBOL(wios, std::, ) SYMBOL(wios, std::, ) SYMBOL(wiostream, std::, ) SYMBOL(wiostream, std::, ) SYMBOL(wiostream, std::, ) SYMBOL(wispanstream, std::, ) SYMBOL(wispanstream, std::, ) SYMBOL(wistream, std::, ) SYMBOL(wistream, std::, ) SYMBOL(wistream, std::, ) SYMBOL(wistringstream, std::, ) SYMBOL(wistringstream, std::, ) SYMBOL(wmemchr, std::, ) SYMBOL(wmemchr, None, ) SYMBOL(wmemchr, None, ) SYMBOL(wmemcmp, std::, ) SYMBOL(wmemcmp, None, ) SYMBOL(wmemcmp, None, ) SYMBOL(wmemcpy, std::, ) SYMBOL(wmemcpy, None, ) SYMBOL(wmemcpy, None, ) SYMBOL(wmemmove, std::, ) SYMBOL(wmemmove, None, ) SYMBOL(wmemmove, None, ) SYMBOL(wmemset, std::, ) SYMBOL(wmemset, None, ) SYMBOL(wmemset, None, ) SYMBOL(wofstream, std::, ) SYMBOL(wofstream, std::, ) SYMBOL(wospanstream, std::, ) SYMBOL(wospanstream, std::, ) SYMBOL(wostream, std::, ) SYMBOL(wostream, std::, ) SYMBOL(wostream, std::, ) SYMBOL(wostringstream, std::, ) SYMBOL(wostringstream, std::, ) SYMBOL(wosyncstream, std::, ) SYMBOL(wosyncstream, std::, ) SYMBOL(wprintf, std::, ) SYMBOL(wprintf, None, ) SYMBOL(wprintf, None, ) SYMBOL(wregex, std::, ) SYMBOL(ws, std::, ) SYMBOL(ws, std::, ) SYMBOL(wscanf, std::, ) SYMBOL(wscanf, None, ) SYMBOL(wscanf, None, ) SYMBOL(wsmatch, std::, ) SYMBOL(wspanbuf, std::, ) SYMBOL(wspanbuf, std::, ) SYMBOL(wspanstream, std::, ) SYMBOL(wspanstream, std::, ) SYMBOL(wsregex_iterator, std::, ) SYMBOL(wsregex_token_iterator, std::, ) SYMBOL(wssub_match, std::, ) SYMBOL(wstreambuf, std::, ) SYMBOL(wstreambuf, std::, ) SYMBOL(wstreambuf, std::, ) SYMBOL(wstreampos, std::, ) SYMBOL(wstreampos, std::, ) SYMBOL(wstring, std::, ) SYMBOL(wstring_convert, std::, ) SYMBOL(wstring_view, std::, ) SYMBOL(wstringbuf, std::, ) SYMBOL(wstringbuf, std::, ) SYMBOL(wstringstream, std::, ) SYMBOL(wstringstream, std::, ) SYMBOL(wsyncbuf, std::, ) SYMBOL(wsyncbuf, std::, ) SYMBOL(yocto, std::, ) SYMBOL(yotta, std::, ) SYMBOL(zepto, std::, ) SYMBOL(zetta, std::, ) SYMBOL(April, std::chrono::, ) SYMBOL(August, std::chrono::, ) SYMBOL(December, std::chrono::, ) SYMBOL(February, std::chrono::, ) SYMBOL(Friday, std::chrono::, ) SYMBOL(January, std::chrono::, ) SYMBOL(July, std::chrono::, ) SYMBOL(June, std::chrono::, ) SYMBOL(March, std::chrono::, ) SYMBOL(May, std::chrono::, ) SYMBOL(Monday, std::chrono::, ) SYMBOL(November, std::chrono::, ) SYMBOL(October, std::chrono::, ) SYMBOL(Saturday, std::chrono::, ) SYMBOL(September, std::chrono::, ) SYMBOL(Sunday, std::chrono::, ) SYMBOL(Thursday, std::chrono::, ) SYMBOL(Tuesday, std::chrono::, ) SYMBOL(Wednesday, std::chrono::, ) SYMBOL(abs, std::chrono::, ) SYMBOL(ambiguous_local_time, std::chrono::, ) SYMBOL(choose, std::chrono::, ) SYMBOL(clock_cast, std::chrono::, ) SYMBOL(clock_time_conversion, std::chrono::, ) SYMBOL(current_zone, std::chrono::, ) SYMBOL(day, std::chrono::, ) SYMBOL(duration, std::chrono::, ) SYMBOL(duration_cast, std::chrono::, ) SYMBOL(duration_values, std::chrono::, ) SYMBOL(file_clock, std::chrono::, ) SYMBOL(file_seconds, std::chrono::, ) SYMBOL(file_time, std::chrono::, ) SYMBOL(get_leap_second_info, std::chrono::, ) SYMBOL(gps_clock, std::chrono::, ) SYMBOL(gps_seconds, std::chrono::, ) SYMBOL(gps_time, std::chrono::, ) SYMBOL(hh_mm_ss, std::chrono::, ) SYMBOL(high_resolution_clock, std::chrono::, ) SYMBOL(hours, 
std::chrono::, ) SYMBOL(is_am, std::chrono::, ) SYMBOL(is_clock, std::chrono::, ) SYMBOL(is_clock_v, std::chrono::, ) SYMBOL(is_pm, std::chrono::, ) SYMBOL(last, std::chrono::, ) SYMBOL(last_spec, std::chrono::, ) SYMBOL(leap_second, std::chrono::, ) SYMBOL(leap_second_info, std::chrono::, ) SYMBOL(local_info, std::chrono::, ) SYMBOL(local_seconds, std::chrono::, ) SYMBOL(local_t, std::chrono::, ) SYMBOL(local_time, std::chrono::, ) SYMBOL(local_time_format, std::chrono::, ) SYMBOL(locate_zone, std::chrono::, ) SYMBOL(make12, std::chrono::, ) SYMBOL(make24, std::chrono::, ) SYMBOL(microseconds, std::chrono::, ) SYMBOL(milliseconds, std::chrono::, ) SYMBOL(minutes, std::chrono::, ) SYMBOL(month, std::chrono::, ) SYMBOL(month_day, std::chrono::, ) SYMBOL(month_day_last, std::chrono::, ) SYMBOL(month_weekday, std::chrono::, ) SYMBOL(month_weekday_last, std::chrono::, ) SYMBOL(nanoseconds, std::chrono::, ) SYMBOL(nonexistent_local_time, std::chrono::, ) SYMBOL(parse, std::chrono::, ) SYMBOL(seconds, std::chrono::, ) SYMBOL(steady_clock, std::chrono::, ) SYMBOL(sys_days, std::chrono::, ) SYMBOL(sys_info, std::chrono::, ) SYMBOL(sys_seconds, std::chrono::, ) SYMBOL(sys_time, std::chrono::, ) SYMBOL(system_clock, std::chrono::, ) SYMBOL(tai_clock, std::chrono::, ) SYMBOL(tai_seconds, std::chrono::, ) SYMBOL(tai_time, std::chrono::, ) SYMBOL(time_point, std::chrono::, ) SYMBOL(time_point_cast, std::chrono::, ) SYMBOL(time_zone, std::chrono::, ) SYMBOL(time_zone_link, std::chrono::, ) SYMBOL(treat_as_floating_point, std::chrono::, ) SYMBOL(treat_as_floating_point_v, std::chrono::, ) SYMBOL(tzdb, std::chrono::, ) SYMBOL(tzdb_list, std::chrono::, ) SYMBOL(utc_clock, std::chrono::, ) SYMBOL(utc_seconds, std::chrono::, ) SYMBOL(utc_time, std::chrono::, ) SYMBOL(weekday, std::chrono::, ) SYMBOL(weekday_indexed, std::chrono::, ) SYMBOL(weekday_last, std::chrono::, ) SYMBOL(year, std::chrono::, ) SYMBOL(year_month, std::chrono::, ) SYMBOL(year_month_day, std::chrono::, ) SYMBOL(year_month_day_last, std::chrono::, ) SYMBOL(year_month_weekday, std::chrono::, ) SYMBOL(year_month_weekday_last, std::chrono::, ) SYMBOL(zoned_seconds, std::chrono::, ) SYMBOL(zoned_time, std::chrono::, ) SYMBOL(zoned_traits, std::chrono::, ) SYMBOL(par, std::execution::, ) SYMBOL(par_unseq, std::execution::, ) SYMBOL(parallel_policy, std::execution::, ) SYMBOL(parallel_unsequenced_policy, std::execution::, ) SYMBOL(seq, std::execution::, ) SYMBOL(sequenced_policy, std::execution::, ) SYMBOL(unseq, std::execution::, ) SYMBOL(unsequenced_policy, std::execution::, ) SYMBOL(absolute, std::filesystem::, ) SYMBOL(canonical, std::filesystem::, ) SYMBOL(copy, std::filesystem::, ) SYMBOL(copy_file, std::filesystem::, ) SYMBOL(copy_options, std::filesystem::, ) SYMBOL(copy_symlink, std::filesystem::, ) SYMBOL(create_directories, std::filesystem::, ) SYMBOL(create_directory, std::filesystem::, ) SYMBOL(create_directory_symlink, std::filesystem::, ) SYMBOL(create_hard_link, std::filesystem::, ) SYMBOL(create_symlink, std::filesystem::, ) SYMBOL(current_path, std::filesystem::, ) SYMBOL(directory_entry, std::filesystem::, ) SYMBOL(directory_iterator, std::filesystem::, ) SYMBOL(directory_options, std::filesystem::, ) SYMBOL(equivalent, std::filesystem::, ) SYMBOL(exists, std::filesystem::, ) SYMBOL(file_size, std::filesystem::, ) SYMBOL(file_status, std::filesystem::, ) SYMBOL(file_time_type, std::filesystem::, ) SYMBOL(file_type, std::filesystem::, ) SYMBOL(filesystem_error, std::filesystem::, ) SYMBOL(hard_link_count, std::filesystem::, ) 
SYMBOL(hash_value, std::filesystem::, ) SYMBOL(is_block_file, std::filesystem::, ) SYMBOL(is_character_file, std::filesystem::, ) SYMBOL(is_directory, std::filesystem::, ) SYMBOL(is_empty, std::filesystem::, ) SYMBOL(is_fifo, std::filesystem::, ) SYMBOL(is_other, std::filesystem::, ) SYMBOL(is_regular_file, std::filesystem::, ) SYMBOL(is_socket, std::filesystem::, ) SYMBOL(is_symlink, std::filesystem::, ) SYMBOL(last_write_time, std::filesystem::, ) SYMBOL(path, std::filesystem::, ) SYMBOL(perm_options, std::filesystem::, ) SYMBOL(permissions, std::filesystem::, ) SYMBOL(perms, std::filesystem::, ) SYMBOL(proximate, std::filesystem::, ) SYMBOL(read_symlink, std::filesystem::, ) SYMBOL(recursive_directory_iterator, std::filesystem::, ) SYMBOL(relative, std::filesystem::, ) SYMBOL(remove, std::filesystem::, ) SYMBOL(remove_all, std::filesystem::, ) SYMBOL(rename, std::filesystem::, ) SYMBOL(resize_file, std::filesystem::, ) SYMBOL(space, std::filesystem::, ) SYMBOL(space_info, std::filesystem::, ) SYMBOL(status, std::filesystem::, ) SYMBOL(status_known, std::filesystem::, ) SYMBOL(symlink_status, std::filesystem::, ) SYMBOL(temp_directory_path, std::filesystem::, ) SYMBOL(u8path, std::filesystem::, ) SYMBOL(weakly_canonical, std::filesystem::, ) SYMBOL(e, std::numbers::, ) SYMBOL(e_v, std::numbers::, ) SYMBOL(egamma, std::numbers::, ) SYMBOL(egamma_v, std::numbers::, ) SYMBOL(inv_pi, std::numbers::, ) SYMBOL(inv_pi_v, std::numbers::, ) SYMBOL(inv_sqrt3, std::numbers::, ) SYMBOL(inv_sqrt3_v, std::numbers::, ) SYMBOL(inv_sqrtpi, std::numbers::, ) SYMBOL(inv_sqrtpi_v, std::numbers::, ) SYMBOL(ln10, std::numbers::, ) SYMBOL(ln10_v, std::numbers::, ) SYMBOL(ln2, std::numbers::, ) SYMBOL(ln2_v, std::numbers::, ) SYMBOL(log10e, std::numbers::, ) SYMBOL(log10e_v, std::numbers::, ) SYMBOL(log2e, std::numbers::, ) SYMBOL(log2e_v, std::numbers::, ) SYMBOL(phi, std::numbers::, ) SYMBOL(phi_v, std::numbers::, ) SYMBOL(pi, std::numbers::, ) SYMBOL(pi_v, std::numbers::, ) SYMBOL(sqrt2, std::numbers::, ) SYMBOL(sqrt2_v, std::numbers::, ) SYMBOL(sqrt3, std::numbers::, ) SYMBOL(sqrt3_v, std::numbers::, ) SYMBOL(basic_string, std::pmr::, ) SYMBOL(cmatch, std::pmr::, ) SYMBOL(deque, std::pmr::, ) SYMBOL(forward_list, std::pmr::, ) SYMBOL(get_default_resource, std::pmr::, ) SYMBOL(list, std::pmr::, ) SYMBOL(map, std::pmr::, ) SYMBOL(match_results, std::pmr::, ) SYMBOL(memory_resource, std::pmr::, ) SYMBOL(monotonic_buffer_resource, std::pmr::, ) SYMBOL(multimap, std::pmr::, ) SYMBOL(multiset, std::pmr::, ) SYMBOL(new_delete_resource, std::pmr::, ) SYMBOL(null_memory_resource, std::pmr::, ) SYMBOL(polymorphic_allocator, std::pmr::, ) SYMBOL(pool_options, std::pmr::, ) SYMBOL(set, std::pmr::, ) SYMBOL(set_default_resource, std::pmr::, ) SYMBOL(smatch, std::pmr::, ) SYMBOL(stacktrace, std::pmr::, ) SYMBOL(string, std::pmr::, ) SYMBOL(synchronized_pool_resource, std::pmr::, ) SYMBOL(u16string, std::pmr::, ) SYMBOL(u32string, std::pmr::, ) SYMBOL(u8string, std::pmr::, ) SYMBOL(unordered_map, std::pmr::, ) SYMBOL(unordered_multimap, std::pmr::, ) SYMBOL(unordered_multiset, std::pmr::, ) SYMBOL(unordered_set, std::pmr::, ) SYMBOL(unsynchronized_pool_resource, std::pmr::, ) SYMBOL(vector, std::pmr::, ) SYMBOL(wcmatch, std::pmr::, ) SYMBOL(wsmatch, std::pmr::, ) SYMBOL(wstring, std::pmr::, ) SYMBOL(adjacent_find, std::ranges::, ) SYMBOL(advance, std::ranges::, ) SYMBOL(all_of, std::ranges::, ) SYMBOL(any_of, std::ranges::, ) SYMBOL(as_const_view, std::ranges::, ) SYMBOL(as_rvalue_view, std::ranges::, ) 
SYMBOL(basic_istream_view, std::ranges::, ) SYMBOL(begin, std::ranges::, ) SYMBOL(bidirectional_range, std::ranges::, ) SYMBOL(binary_transform_result, std::ranges::, ) SYMBOL(borrowed_iterator_t, std::ranges::, ) SYMBOL(borrowed_range, std::ranges::, ) SYMBOL(borrowed_subrange_t, std::ranges::, ) SYMBOL(cbegin, std::ranges::, ) SYMBOL(cdata, std::ranges::, ) SYMBOL(cend, std::ranges::, ) SYMBOL(clamp, std::ranges::, ) SYMBOL(common_range, std::ranges::, ) SYMBOL(common_view, std::ranges::, ) SYMBOL(const_iterator_t, std::ranges::, ) SYMBOL(constant_range, std::ranges::, ) SYMBOL(construct_at, std::ranges::, ) SYMBOL(contains, std::ranges::, ) SYMBOL(contains_subrange, std::ranges::, ) SYMBOL(contiguous_range, std::ranges::, ) SYMBOL(copy, std::ranges::, ) SYMBOL(copy_backward, std::ranges::, ) SYMBOL(copy_backward_result, std::ranges::, ) SYMBOL(copy_if, std::ranges::, ) SYMBOL(copy_if_result, std::ranges::, ) SYMBOL(copy_n, std::ranges::, ) SYMBOL(copy_n_result, std::ranges::, ) SYMBOL(copy_result, std::ranges::, ) SYMBOL(count, std::ranges::, ) SYMBOL(count_if, std::ranges::, ) SYMBOL(crbegin, std::ranges::, ) SYMBOL(crend, std::ranges::, ) SYMBOL(dangling, std::ranges::, ) SYMBOL(data, std::ranges::, ) SYMBOL(destroy, std::ranges::, ) SYMBOL(destroy_at, std::ranges::, ) SYMBOL(destroy_n, std::ranges::, ) SYMBOL(disable_sized_range, std::ranges::, ) SYMBOL(distance, std::ranges::, ) SYMBOL(drop_view, std::ranges::, ) SYMBOL(drop_while_view, std::ranges::, ) SYMBOL(elements_view, std::ranges::, ) SYMBOL(empty, std::ranges::, ) SYMBOL(empty_view, std::ranges::, ) SYMBOL(enable_borrowed_range, std::ranges::, ) SYMBOL(enable_view, std::ranges::, ) SYMBOL(end, std::ranges::, ) SYMBOL(ends_with, std::ranges::, ) SYMBOL(equal, std::ranges::, ) SYMBOL(equal_to, std::ranges::, ) SYMBOL(fill, std::ranges::, ) SYMBOL(fill_n, std::ranges::, ) SYMBOL(filter_view, std::ranges::, ) SYMBOL(find, std::ranges::, ) SYMBOL(find_end, std::ranges::, ) SYMBOL(find_first_of, std::ranges::, ) SYMBOL(find_if, std::ranges::, ) SYMBOL(find_if_not, std::ranges::, ) SYMBOL(find_last, std::ranges::, ) SYMBOL(find_last_if, std::ranges::, ) SYMBOL(find_last_if_not, std::ranges::, ) SYMBOL(for_each, std::ranges::, ) SYMBOL(for_each_n, std::ranges::, ) SYMBOL(for_each_n_result, std::ranges::, ) SYMBOL(for_each_result, std::ranges::, ) SYMBOL(forward_range, std::ranges::, ) SYMBOL(generate, std::ranges::, ) SYMBOL(generate_n, std::ranges::, ) SYMBOL(greater, std::ranges::, ) SYMBOL(greater_equal, std::ranges::, ) SYMBOL(in_found_result, std::ranges::, ) SYMBOL(in_fun_result, std::ranges::, ) SYMBOL(in_in_out_result, std::ranges::, ) SYMBOL(in_in_result, std::ranges::, ) SYMBOL(in_out_out_result, std::ranges::, ) SYMBOL(in_out_result, std::ranges::, ) SYMBOL(in_value_result, std::ranges::, ) SYMBOL(includes, std::ranges::, ) SYMBOL(inplace_merge, std::ranges::, ) SYMBOL(input_range, std::ranges::, ) SYMBOL(iota, std::ranges::, ) SYMBOL(iota_result, std::ranges::, ) SYMBOL(iota_view, std::ranges::, ) SYMBOL(is_heap, std::ranges::, ) SYMBOL(is_heap_until, std::ranges::, ) SYMBOL(is_partitioned, std::ranges::, ) SYMBOL(is_permutation, std::ranges::, ) SYMBOL(is_sorted, std::ranges::, ) SYMBOL(is_sorted_until, std::ranges::, ) SYMBOL(istream_view, std::ranges::, ) SYMBOL(iter_move, std::ranges::, ) SYMBOL(iter_swap, std::ranges::, ) SYMBOL(iterator_t, std::ranges::, ) SYMBOL(join_view, std::ranges::, ) SYMBOL(join_with_view, std::ranges::, ) SYMBOL(keys_view, std::ranges::, ) SYMBOL(lazy_split_view, std::ranges::, ) 
SYMBOL(less, std::ranges::, ) SYMBOL(less_equal, std::ranges::, ) SYMBOL(lexicographical_compare, std::ranges::, ) SYMBOL(make_heap, std::ranges::, ) SYMBOL(max, std::ranges::, ) SYMBOL(max_element, std::ranges::, ) SYMBOL(merge, std::ranges::, ) SYMBOL(merge_result, std::ranges::, ) SYMBOL(min, std::ranges::, ) SYMBOL(min_element, std::ranges::, ) SYMBOL(min_max_result, std::ranges::, ) SYMBOL(minmax, std::ranges::, ) SYMBOL(minmax_element, std::ranges::, ) SYMBOL(minmax_element_result, std::ranges::, ) SYMBOL(minmax_result, std::ranges::, ) SYMBOL(mismatch, std::ranges::, ) SYMBOL(mismatch_result, std::ranges::, ) SYMBOL(move, std::ranges::, ) SYMBOL(move_backward, std::ranges::, ) SYMBOL(move_backward_result, std::ranges::, ) SYMBOL(move_result, std::ranges::, ) SYMBOL(next, std::ranges::, ) SYMBOL(next_permutation, std::ranges::, ) SYMBOL(next_permutation_result, std::ranges::, ) SYMBOL(none_of, std::ranges::, ) SYMBOL(not_equal_to, std::ranges::, ) SYMBOL(nth_element, std::ranges::, ) SYMBOL(out_value_result, std::ranges::, ) SYMBOL(output_range, std::ranges::, ) SYMBOL(owning_view, std::ranges::, ) SYMBOL(partial_sort, std::ranges::, ) SYMBOL(partial_sort_copy, std::ranges::, ) SYMBOL(partial_sort_copy_result, std::ranges::, ) SYMBOL(partition, std::ranges::, ) SYMBOL(partition_copy, std::ranges::, ) SYMBOL(partition_copy_result, std::ranges::, ) SYMBOL(partition_point, std::ranges::, ) SYMBOL(pop_heap, std::ranges::, ) SYMBOL(prev, std::ranges::, ) SYMBOL(prev_permutation, std::ranges::, ) SYMBOL(prev_permutation_result, std::ranges::, ) SYMBOL(push_heap, std::ranges::, ) SYMBOL(random_access_range, std::ranges::, ) SYMBOL(range, std::ranges::, ) SYMBOL(range_const_reference_t, std::ranges::, ) SYMBOL(range_difference_t, std::ranges::, ) SYMBOL(range_reference_t, std::ranges::, ) SYMBOL(range_rvalue_reference_t, std::ranges::, ) SYMBOL(range_size_t, std::ranges::, ) SYMBOL(range_value_t, std::ranges::, ) SYMBOL(rbegin, std::ranges::, ) SYMBOL(ref_view, std::ranges::, ) SYMBOL(remove, std::ranges::, ) SYMBOL(remove_copy, std::ranges::, ) SYMBOL(remove_copy_if, std::ranges::, ) SYMBOL(remove_copy_if_result, std::ranges::, ) SYMBOL(remove_copy_result, std::ranges::, ) SYMBOL(remove_if, std::ranges::, ) SYMBOL(rend, std::ranges::, ) SYMBOL(replace, std::ranges::, ) SYMBOL(replace_copy, std::ranges::, ) SYMBOL(replace_copy_if, std::ranges::, ) SYMBOL(replace_copy_if_result, std::ranges::, ) SYMBOL(replace_copy_result, std::ranges::, ) SYMBOL(replace_if, std::ranges::, ) SYMBOL(reverse, std::ranges::, ) SYMBOL(reverse_copy, std::ranges::, ) SYMBOL(reverse_copy_result, std::ranges::, ) SYMBOL(reverse_view, std::ranges::, ) SYMBOL(rotate, std::ranges::, ) SYMBOL(rotate_copy, std::ranges::, ) SYMBOL(rotate_copy_result, std::ranges::, ) SYMBOL(sample, std::ranges::, ) SYMBOL(search, std::ranges::, ) SYMBOL(search_n, std::ranges::, ) SYMBOL(sentinel_t, std::ranges::, ) SYMBOL(set_difference, std::ranges::, ) SYMBOL(set_difference_result, std::ranges::, ) SYMBOL(set_intersection, std::ranges::, ) SYMBOL(set_intersection_result, std::ranges::, ) SYMBOL(set_symmetric_difference, std::ranges::, ) SYMBOL(set_symmetric_difference_result, std::ranges::, ) SYMBOL(set_union, std::ranges::, ) SYMBOL(set_union_result, std::ranges::, ) SYMBOL(shift_left, std::ranges::, ) SYMBOL(shift_right, std::ranges::, ) SYMBOL(shuffle, std::ranges::, ) SYMBOL(single_view, std::ranges::, ) SYMBOL(size, std::ranges::, ) SYMBOL(sized_range, std::ranges::, ) SYMBOL(sort, std::ranges::, ) SYMBOL(sort_heap, std::ranges::, ) 
SYMBOL(split_view, std::ranges::, ) SYMBOL(ssize, std::ranges::, ) SYMBOL(stable_partition, std::ranges::, ) SYMBOL(stable_sort, std::ranges::, ) SYMBOL(starts_with, std::ranges::, ) SYMBOL(subrange, std::ranges::, ) SYMBOL(subrange_kind, std::ranges::, ) SYMBOL(swap, std::ranges::, ) SYMBOL(swap_ranges, std::ranges::, ) SYMBOL(swap_ranges_result, std::ranges::, ) SYMBOL(take_view, std::ranges::, ) SYMBOL(take_while_view, std::ranges::, ) SYMBOL(to, std::ranges::, ) SYMBOL(transform, std::ranges::, ) SYMBOL(transform_view, std::ranges::, ) SYMBOL(unary_transform_result, std::ranges::, ) SYMBOL(uninitialized_copy, std::ranges::, ) SYMBOL(uninitialized_copy_n, std::ranges::, ) SYMBOL(uninitialized_copy_n_result, std::ranges::, ) SYMBOL(uninitialized_copy_result, std::ranges::, ) SYMBOL(uninitialized_default_construct, std::ranges::, ) SYMBOL(uninitialized_default_construct_n, std::ranges::, ) SYMBOL(uninitialized_fill, std::ranges::, ) SYMBOL(uninitialized_fill_n, std::ranges::, ) SYMBOL(uninitialized_move, std::ranges::, ) SYMBOL(uninitialized_move_n, std::ranges::, ) SYMBOL(uninitialized_move_n_result, std::ranges::, ) SYMBOL(uninitialized_move_result, std::ranges::, ) SYMBOL(uninitialized_value_construct, std::ranges::, ) SYMBOL(uninitialized_value_construct_n, std::ranges::, ) SYMBOL(unique, std::ranges::, ) SYMBOL(unique_copy, std::ranges::, ) SYMBOL(unique_copy_result, std::ranges::, ) SYMBOL(values_view, std::ranges::, ) SYMBOL(view, std::ranges::, ) SYMBOL(view_base, std::ranges::, ) SYMBOL(view_interface, std::ranges::, ) SYMBOL(viewable_range, std::ranges::, ) SYMBOL(wistream_view, std::ranges::, ) SYMBOL(zip_transform_view, std::ranges::, ) SYMBOL(zip_view, std::ranges::, ) +SYMBOL(all, std::ranges::views::, ) +SYMBOL(all_t, std::ranges::views::, ) +SYMBOL(as_const, std::ranges::views::, ) +SYMBOL(as_rvalue, std::ranges::views::, ) +SYMBOL(common, std::ranges::views::, ) +SYMBOL(counted, std::ranges::views::, ) +SYMBOL(drop, std::ranges::views::, ) +SYMBOL(drop_while, std::ranges::views::, ) +SYMBOL(elements, std::ranges::views::, ) +SYMBOL(empty, std::ranges::views::, ) +SYMBOL(filter, std::ranges::views::, ) +SYMBOL(iota, std::ranges::views::, ) +SYMBOL(istream, std::ranges::views::, ) +SYMBOL(istream, std::ranges::views::, ) +SYMBOL(join, std::ranges::views::, ) +SYMBOL(join_with, std::ranges::views::, ) +SYMBOL(keys, std::ranges::views::, ) +SYMBOL(lazy_split, std::ranges::views::, ) +SYMBOL(reverse, std::ranges::views::, ) +SYMBOL(single, std::ranges::views::, ) +SYMBOL(split, std::ranges::views::, ) +SYMBOL(take, std::ranges::views::, ) +SYMBOL(take_while, std::ranges::views::, ) +SYMBOL(transform, std::ranges::views::, ) +SYMBOL(values, std::ranges::views::, ) +SYMBOL(zip, std::ranges::views::, ) +SYMBOL(zip_transform, std::ranges::views::, ) SYMBOL(ECMAScript, std::regex_constants::, ) SYMBOL(awk, std::regex_constants::, ) SYMBOL(basic, std::regex_constants::, ) SYMBOL(collate, std::regex_constants::, ) SYMBOL(egrep, std::regex_constants::, ) SYMBOL(error_backref, std::regex_constants::, ) SYMBOL(error_badbrace, std::regex_constants::, ) SYMBOL(error_badrepeat, std::regex_constants::, ) SYMBOL(error_brace, std::regex_constants::, ) SYMBOL(error_brack, std::regex_constants::, ) SYMBOL(error_collate, std::regex_constants::, ) SYMBOL(error_complexity, std::regex_constants::, ) SYMBOL(error_ctype, std::regex_constants::, ) SYMBOL(error_escape, std::regex_constants::, ) SYMBOL(error_paren, std::regex_constants::, ) SYMBOL(error_range, std::regex_constants::, ) 
SYMBOL(error_space, std::regex_constants::, ) SYMBOL(error_stack, std::regex_constants::, ) SYMBOL(error_type, std::regex_constants::, ) SYMBOL(extended, std::regex_constants::, ) SYMBOL(format_default, std::regex_constants::, ) SYMBOL(format_first_only, std::regex_constants::, ) SYMBOL(format_no_copy, std::regex_constants::, ) SYMBOL(format_sed, std::regex_constants::, ) SYMBOL(grep, std::regex_constants::, ) SYMBOL(icase, std::regex_constants::, ) SYMBOL(match_any, std::regex_constants::, ) SYMBOL(match_continuous, std::regex_constants::, ) SYMBOL(match_default, std::regex_constants::, ) SYMBOL(match_flag_type, std::regex_constants::, ) SYMBOL(match_not_bol, std::regex_constants::, ) SYMBOL(match_not_bow, std::regex_constants::, ) SYMBOL(match_not_eol, std::regex_constants::, ) SYMBOL(match_not_eow, std::regex_constants::, ) SYMBOL(match_not_null, std::regex_constants::, ) SYMBOL(match_prev_avail, std::regex_constants::, ) SYMBOL(multiline, std::regex_constants::, ) SYMBOL(nosubs, std::regex_constants::, ) SYMBOL(optimize, std::regex_constants::, ) SYMBOL(syntax_option_type, std::regex_constants::, ) SYMBOL(get_id, std::this_thread::, ) SYMBOL(sleep_for, std::this_thread::, ) SYMBOL(sleep_until, std::this_thread::, ) SYMBOL(yield, std::this_thread::, ) +SYMBOL(all, std::views::, ) +SYMBOL(all_t, std::views::, ) +SYMBOL(as_const, std::views::, ) +SYMBOL(as_rvalue, std::views::, ) +SYMBOL(common, std::views::, ) +SYMBOL(counted, std::views::, ) +SYMBOL(drop, std::views::, ) +SYMBOL(drop_while, std::views::, ) +SYMBOL(elements, std::views::, ) +SYMBOL(empty, std::views::, ) +SYMBOL(filter, std::views::, ) +SYMBOL(iota, std::views::, ) +SYMBOL(istream, std::views::, ) +SYMBOL(istream, std::views::, ) +SYMBOL(join, std::views::, ) +SYMBOL(join_with, std::views::, ) +SYMBOL(keys, std::views::, ) +SYMBOL(lazy_split, std::views::, ) +SYMBOL(reverse, std::views::, ) +SYMBOL(single, std::views::, ) +SYMBOL(split, std::views::, ) +SYMBOL(take, std::views::, ) +SYMBOL(take_while, std::views::, ) +SYMBOL(transform, std::views::, ) +SYMBOL(values, std::views::, ) +SYMBOL(zip, std::views::, ) +SYMBOL(zip_transform, std::views::, ) diff --git a/compiler-rt/lib/builtins/aarch64/lse.S b/compiler-rt/lib/builtins/aarch64/lse.S index 5dc0d5320b5a..1fe18f4a4681 100644 --- a/compiler-rt/lib/builtins/aarch64/lse.S +++ b/compiler-rt/lib/builtins/aarch64/lse.S @@ -1,236 +1,270 @@ // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "assembly.h" // Out-of-line LSE atomics helpers. Ported from libgcc library. // N = {1, 2, 4, 8} // M = {1, 2, 4, 8, 16} -// ORDER = {'relax', 'acq', 'rel', 'acq_rel'} +// ORDER = {'relax', 'acq', 'rel', 'acq_rel', 'sync'} // Routines implemented: // // iM __aarch64_casM_ORDER(iM expected, iM desired, iM *ptr) // iN __aarch64_swpN_ORDER(iN val, iN *ptr) // iN __aarch64_ldaddN_ORDER(iN val, iN *ptr) // iN __aarch64_ldclrN_ORDER(iN val, iN *ptr) // iN __aarch64_ldeorN_ORDER(iN val, iN *ptr) // iN __aarch64_ldsetN_ORDER(iN val, iN *ptr) // // Routines may modify temporary registers tmp0, tmp1, tmp2, // return value x0 and the flags only. 
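The helpers described above are the entry points AArch64 compilers emit calls to when atomics are outlined (for example with `-moutline-atomics`); the new `_sync` ORDER added in this patch maps to acquire-release encodings (acquire-only for `swp`) on the LSE path and appends a `dmb ish` barrier on the LL/SC fallback path. A rough C++ illustration of a call site that would typically be routed through one of these routines — the specific helper name (`__aarch64_cas4_acq_rel` for a 4-byte acquire-release CAS) is assumed here for illustration:

#include <atomic>

// A 4-byte compare-and-swap with acquire-release ordering. Built for AArch64
// with outlined atomics, this typically becomes a call to a helper such as
// __aarch64_cas4_acq_rel, which uses the LSE `cas` instruction when the CPU
// supports it and otherwise falls back to an LL/SC loop like the one below.
bool try_claim(std::atomic<unsigned> &slot, unsigned expected, unsigned desired) {
  return slot.compare_exchange_strong(expected, desired,
                                      std::memory_order_acq_rel,
                                      std::memory_order_acquire);
}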
#ifdef __aarch64__ #ifdef HAS_ASM_LSE .arch armv8-a+lse #else .arch armv8-a #endif #if !defined(__APPLE__) HIDDEN(__aarch64_have_lse_atomics) #else HIDDEN(___aarch64_have_lse_atomics) #endif // Generate mnemonics for -// L_cas: SIZE: 1,2,4,8,16 MODEL: 1,2,3,4 -// L_swp L_ldadd L_ldclr L_ldeor L_ldset: SIZE: 1,2,4,8 MODEL: 1,2,3,4 +// L_cas: SIZE: 1,2,4,8,16 MODEL: 1,2,3,4,5 +// L_swp L_ldadd L_ldclr L_ldeor L_ldset: SIZE: 1,2,4,8 MODEL: 1,2,3,4,5 #if SIZE == 1 #define S b #define UXT uxtb #define B 0x00000000 #elif SIZE == 2 #define S h #define UXT uxth #define B 0x40000000 #elif SIZE == 4 || SIZE == 8 || SIZE == 16 #define S #define UXT mov #if SIZE == 4 #define B 0x80000000 #elif SIZE == 8 #define B 0xc0000000 #endif #else #error #endif // SIZE #if MODEL == 1 #define SUFF _relax #define A #define L #define M 0x000000 #define N 0x000000 +#define BARRIER #elif MODEL == 2 #define SUFF _acq #define A a #define L #define M 0x400000 #define N 0x800000 +#define BARRIER #elif MODEL == 3 #define SUFF _rel #define A #define L l #define M 0x008000 #define N 0x400000 +#define BARRIER #elif MODEL == 4 #define SUFF _acq_rel #define A a #define L l #define M 0x408000 #define N 0xc00000 +#define BARRIER +#elif MODEL == 5 +#define SUFF _sync +#ifdef L_swp +// swp has _acq semantics. +#define A a +#define L +#define M 0x400000 +#define N 0x800000 +#else +// All other _sync functions have _seq semantics. +#define A a +#define L l +#define M 0x408000 +#define N 0xc00000 +#endif +#define BARRIER dmb ish #else #error #endif // MODEL // Define register size. #define x(N) GLUE2(x, N) #define w(N) GLUE2(w, N) #if SIZE < 8 #define s(N) w(N) #else #define s(N) x(N) #endif #define NAME(BASE) GLUE4(__aarch64_, BASE, SIZE, SUFF) +#if MODEL == 5 +// Drop A for _sync functions. +#define LDXR GLUE3(ld, xr, S) +#else #define LDXR GLUE4(ld, A, xr, S) +#endif #define STXR GLUE4(st, L, xr, S) // Define temporary registers. #define tmp0 16 #define tmp1 17 #define tmp2 15 // Macro for branch to label if no LSE available .macro JUMP_IF_NOT_LSE label #if !defined(__APPLE__) adrp x(tmp0), __aarch64_have_lse_atomics ldrb w(tmp0), [x(tmp0), :lo12:__aarch64_have_lse_atomics] #else adrp x(tmp0), ___aarch64_have_lse_atomics@page ldrb w(tmp0), [x(tmp0), ___aarch64_have_lse_atomics@pageoff] #endif cbz w(tmp0), \label .endm #ifdef L_cas DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(NAME(cas)) JUMP_IF_NOT_LSE 8f #if SIZE < 16 #ifdef HAS_ASM_LSE #define CAS GLUE4(cas, A, L, S) s(0), s(1), [x2] #else #define CAS .inst 0x08a07c41 + B + M #endif CAS // s(0), s(1), [x2] ret 8: UXT s(tmp0), s(0) 0: LDXR s(0), [x2] cmp s(0), s(tmp0) bne 1f STXR w(tmp1), s(1), [x2] cbnz w(tmp1), 0b 1: + BARRIER ret #else +#if MODEL == 5 +// Drop A for _sync functions. 
+#define LDXP GLUE2(ld, xp) +#else #define LDXP GLUE3(ld, A, xp) +#endif #define STXP GLUE3(st, L, xp) #ifdef HAS_ASM_LSE #define CASP GLUE3(casp, A, L) x0, x1, x2, x3, [x4] #else #define CASP .inst 0x48207c82 + M #endif CASP // x0, x1, x2, x3, [x4] ret 8: mov x(tmp0), x0 mov x(tmp1), x1 0: LDXP x0, x1, [x4] cmp x0, x(tmp0) ccmp x1, x(tmp1), #0, eq bne 1f STXP w(tmp2), x2, x3, [x4] cbnz w(tmp2), 0b 1: + BARRIER ret #endif END_COMPILERRT_OUTLINE_FUNCTION(NAME(cas)) #endif // L_cas #ifdef L_swp #ifdef HAS_ASM_LSE #define SWP GLUE4(swp, A, L, S) s(0), s(0), [x1] #else #define SWP .inst 0x38208020 + B + N #endif DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(NAME(swp)) JUMP_IF_NOT_LSE 8f SWP // s(0), s(0), [x1] ret 8: mov s(tmp0), s(0) 0: LDXR s(0), [x1] STXR w(tmp1), s(tmp0), [x1] cbnz w(tmp1), 0b + BARRIER ret END_COMPILERRT_OUTLINE_FUNCTION(NAME(swp)) #endif // L_swp #if defined(L_ldadd) || defined(L_ldclr) || \ defined(L_ldeor) || defined(L_ldset) #ifdef L_ldadd #define LDNM ldadd #define OP add #define OPN 0x0000 #elif defined(L_ldclr) #define LDNM ldclr #define OP bic #define OPN 0x1000 #elif defined(L_ldeor) #define LDNM ldeor #define OP eor #define OPN 0x2000 #elif defined(L_ldset) #define LDNM ldset #define OP orr #define OPN 0x3000 #else #error #endif #ifdef HAS_ASM_LSE #define LDOP GLUE4(LDNM, A, L, S) s(0), s(0), [x1] #else #define LDOP .inst 0x38200020 + OPN + B + N #endif DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(NAME(LDNM)) JUMP_IF_NOT_LSE 8f LDOP // s(0), s(0), [x1] ret 8: mov s(tmp0), s(0) 0: LDXR s(0), [x1] OP s(tmp1), s(0), s(tmp0) STXR w(tmp2), s(tmp1), [x1] cbnz w(tmp2), 0b + BARRIER ret END_COMPILERRT_OUTLINE_FUNCTION(NAME(LDNM)) #endif // L_ldadd L_ldclr L_ldeor L_ldset NO_EXEC_STACK_DIRECTIVE // GNU property note for BTI and PAC GNU_PROPERTY_BTI_PAC #endif // __aarch64__ diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_format.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_format.inc index 220abb89c3be..24485900644b 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_format.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_format.inc @@ -1,568 +1,576 @@ //===-- sanitizer_common_interceptors_format.inc ----------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // Scanf/printf implementation for use in *Sanitizer interceptors. // Follows http://pubs.opengroup.org/onlinepubs/9699919799/functions/fscanf.html // and http://pubs.opengroup.org/onlinepubs/9699919799/functions/fprintf.html // with a few common GNU extensions. // //===----------------------------------------------------------------------===// #include static const char *parse_number(const char *p, int *out) { *out = internal_atoll(p); while (*p >= '0' && *p <= '9') ++p; return p; } static const char *maybe_parse_param_index(const char *p, int *out) { // n$ if (*p >= '0' && *p <= '9') { int number; const char *q = parse_number(p, &number); CHECK(q); if (*q == '$') { *out = number; p = q + 1; } } // Otherwise, do not change p. This will be re-parsed later as the field // width. 
return p; } static bool char_is_one_of(char c, const char *s) { return !!internal_strchr(s, c); } static const char *maybe_parse_length_modifier(const char *p, char ll[2]) { if (char_is_one_of(*p, "jztLq")) { ll[0] = *p; ++p; } else if (*p == 'h') { ll[0] = 'h'; ++p; if (*p == 'h') { ll[1] = 'h'; ++p; } } else if (*p == 'l') { ll[0] = 'l'; ++p; if (*p == 'l') { ll[1] = 'l'; ++p; } } return p; } // Returns true if the character is an integer conversion specifier. static bool format_is_integer_conv(char c) { return char_is_one_of(c, "diouxXn"); } // Returns true if the character is an floating point conversion specifier. static bool format_is_float_conv(char c) { return char_is_one_of(c, "aAeEfFgG"); } // Returns string output character size for string-like conversions, // or 0 if the conversion is invalid. static int format_get_char_size(char convSpecifier, const char lengthModifier[2]) { if (char_is_one_of(convSpecifier, "CS")) { return sizeof(wchar_t); } if (char_is_one_of(convSpecifier, "cs[")) { if (lengthModifier[0] == 'l' && lengthModifier[1] == '\0') return sizeof(wchar_t); else if (lengthModifier[0] == '\0') return sizeof(char); } return 0; } enum FormatStoreSize { // Store size not known in advance; can be calculated as wcslen() of the // destination buffer. FSS_WCSLEN = -2, // Store size not known in advance; can be calculated as strlen() of the // destination buffer. FSS_STRLEN = -1, // Invalid conversion specifier. FSS_INVALID = 0 }; // Returns the memory size of a format directive (if >0), or a value of // FormatStoreSize. static int format_get_value_size(char convSpecifier, const char lengthModifier[2], bool promote_float) { if (format_is_integer_conv(convSpecifier)) { switch (lengthModifier[0]) { case 'h': return lengthModifier[1] == 'h' ? sizeof(char) : sizeof(short); case 'l': return lengthModifier[1] == 'l' ? sizeof(long long) : sizeof(long); case 'q': return sizeof(long long); case 'L': return sizeof(long long); case 'j': return sizeof(INTMAX_T); case 'z': return sizeof(SIZE_T); case 't': return sizeof(PTRDIFF_T); case 0: return sizeof(int); default: return FSS_INVALID; } } if (format_is_float_conv(convSpecifier)) { switch (lengthModifier[0]) { case 'L': case 'q': return sizeof(long double); case 'l': return lengthModifier[1] == 'l' ? sizeof(long double) : sizeof(double); case 0: // Printf promotes floats to doubles but scanf does not return promote_float ? sizeof(double) : sizeof(float); default: return FSS_INVALID; } } if (convSpecifier == 'p') { if (lengthModifier[0] != 0) return FSS_INVALID; return sizeof(void *); } return FSS_INVALID; } struct ScanfDirective { int argIdx; // argument index, or -1 if not specified ("%n$") int fieldWidth; const char *begin; const char *end; bool suppressed; // suppress assignment ("*") bool allocate; // allocate space ("m") char lengthModifier[2]; char convSpecifier; bool maybeGnuMalloc; }; // Parse scanf format string. If a valid directive in encountered, it is // returned in dir. This function returns the pointer to the first // unprocessed character, or 0 in case of error. // In case of the end-of-string, a pointer to the closing \0 is returned. 
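Before the parser itself, a few worked examples of the store sizes this logic yields (illustrative only; the buffers and format strings are made up for this sketch, and POSIX `%ms` is shown assuming a libc that supports it — the byte counts follow the format_get_* helpers above and scanf_get_value_size below):

#include <cstdio>
#include <cstdlib>

void sketch() {
  char name[11];
  int year = 0;
  char *dyn = nullptr;
  // "%10s" -> store range (10 + 1) * sizeof(char) = 11 bytes at `name`
  //           (field width plus terminator for string-like conversions)
  // "%d"   -> store range sizeof(int) bytes at `&year`
  std::sscanf("Ada 1843", "%10s %d", name, &year);
  // "%ms"  -> sizeof(char *) bytes at `&dyn`, plus the allocated buffer
  //           itself: internal_strlen(dyn) + 1 bytes at `dyn` (the hunk below
  //           extends this to `%mc`/`%mC`/`%mS`/`%m[` with appropriate sizes)
  std::sscanf("Lovelace", "%ms", &dyn);
  std::free(dyn);
}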
static const char *scanf_parse_next(const char *p, bool allowGnuMalloc, ScanfDirective *dir) { internal_memset(dir, 0, sizeof(*dir)); dir->argIdx = -1; while (*p) { if (*p != '%') { ++p; continue; } dir->begin = p; ++p; // %% if (*p == '%') { ++p; continue; } if (*p == '\0') { return nullptr; } // %n$ p = maybe_parse_param_index(p, &dir->argIdx); CHECK(p); // * if (*p == '*') { dir->suppressed = true; ++p; } // Field width if (*p >= '0' && *p <= '9') { p = parse_number(p, &dir->fieldWidth); CHECK(p); if (dir->fieldWidth <= 0) // Width if at all must be non-zero return nullptr; } // m if (*p == 'm') { dir->allocate = true; ++p; } // Length modifier. p = maybe_parse_length_modifier(p, dir->lengthModifier); // Conversion specifier. dir->convSpecifier = *p++; // Consume %[...] expression. if (dir->convSpecifier == '[') { if (*p == '^') ++p; if (*p == ']') ++p; while (*p && *p != ']') ++p; if (*p == 0) return nullptr; // unexpected end of string // Consume the closing ']'. ++p; } // This is unfortunately ambiguous between old GNU extension // of %as, %aS and %a[...] and newer POSIX %a followed by // letters s, S or [. if (allowGnuMalloc && dir->convSpecifier == 'a' && !dir->lengthModifier[0]) { if (*p == 's' || *p == 'S') { dir->maybeGnuMalloc = true; ++p; } else if (*p == '[') { // Watch for %a[h-j%d], if % appears in the // [...] range, then we need to give up, we don't know // if scanf will parse it as POSIX %a [h-j %d ] or // GNU allocation of string with range dh-j plus %. const char *q = p + 1; if (*q == '^') ++q; if (*q == ']') ++q; while (*q && *q != ']' && *q != '%') ++q; if (*q == 0 || *q == '%') return nullptr; p = q + 1; // Consume the closing ']'. dir->maybeGnuMalloc = true; } } dir->end = p; break; } return p; } static int scanf_get_value_size(ScanfDirective *dir) { if (dir->allocate) { if (!char_is_one_of(dir->convSpecifier, "cCsS[")) return FSS_INVALID; return sizeof(char *); } if (dir->maybeGnuMalloc) { if (dir->convSpecifier != 'a' || dir->lengthModifier[0]) return FSS_INVALID; // This is ambiguous, so check the smaller size of char * (if it is // a GNU extension of %as, %aS or %a[...]) and float (if it is // POSIX %a followed by s, S or [ letters). return sizeof(char *) < sizeof(float) ? sizeof(char *) : sizeof(float); } if (char_is_one_of(dir->convSpecifier, "cCsS[")) { bool needsTerminator = char_is_one_of(dir->convSpecifier, "sS["); unsigned charSize = format_get_char_size(dir->convSpecifier, dir->lengthModifier); if (charSize == 0) return FSS_INVALID; if (dir->fieldWidth == 0) { if (!needsTerminator) return charSize; return (charSize == sizeof(char)) ? FSS_STRLEN : FSS_WCSLEN; } return (dir->fieldWidth + needsTerminator) * charSize; } return format_get_value_size(dir->convSpecifier, dir->lengthModifier, false); } // Common part of *scanf interceptors. // Process format string and va_list, and report all store ranges. // Stops when "consuming" n_inputs input items. static void scanf_common(void *ctx, int n_inputs, bool allowGnuMalloc, const char *format, va_list aq) { CHECK_GT(n_inputs, 0); const char *p = format; COMMON_INTERCEPTOR_READ_RANGE(ctx, format, internal_strlen(format) + 1); while (*p) { ScanfDirective dir; p = scanf_parse_next(p, allowGnuMalloc, &dir); if (!p) break; if (dir.convSpecifier == 0) { // This can only happen at the end of the format string. CHECK_EQ(*p, 0); break; } // Here the directive is valid. Do what it says. if (dir.argIdx != -1) { // Unsupported. 
break; } if (dir.suppressed) continue; int size = scanf_get_value_size(&dir); if (size == FSS_INVALID) { Report("%s: WARNING: unexpected format specifier in scanf interceptor: %.*s\n", SanitizerToolName, static_cast(dir.end - dir.begin), dir.begin); break; } void *argp = va_arg(aq, void *); if (dir.convSpecifier != 'n') --n_inputs; if (n_inputs < 0) break; if (size == FSS_STRLEN) { size = internal_strlen((const char *)argp) + 1; } else if (size == FSS_WCSLEN) { // FIXME: actually use wcslen() to calculate it. size = 0; } COMMON_INTERCEPTOR_WRITE_RANGE(ctx, argp, size); - // For %ms/%mc, write the allocated output buffer as well. + // For %mc/%mC/%ms/%m[/%mS, write the allocated output buffer as well. if (dir.allocate) { - char *buf = *(char **)argp; - if (buf) - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, internal_strlen(buf) + 1); + if (char *buf = *(char **)argp) { + if (dir.convSpecifier == 'c') + size = 1; + else if (dir.convSpecifier == 'C') + size = sizeof(wchar_t); + else if (dir.convSpecifier == 'S') + size = (internal_wcslen((wchar_t *)buf) + 1) * sizeof(wchar_t); + else // 's' or '[' + size = internal_strlen(buf) + 1; + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, size); + } } } } #if SANITIZER_INTERCEPT_PRINTF struct PrintfDirective { int fieldWidth; int fieldPrecision; int argIdx; // width argument index, or -1 if not specified ("%*n$") int precisionIdx; // precision argument index, or -1 if not specified (".*n$") const char *begin; const char *end; bool starredWidth; bool starredPrecision; char lengthModifier[2]; char convSpecifier; }; static const char *maybe_parse_number(const char *p, int *out) { if (*p >= '0' && *p <= '9') p = parse_number(p, out); return p; } static const char *maybe_parse_number_or_star(const char *p, int *out, bool *star) { if (*p == '*') { *star = true; ++p; } else { *star = false; p = maybe_parse_number(p, out); } return p; } // Parse printf format string. Same as scanf_parse_next. static const char *printf_parse_next(const char *p, PrintfDirective *dir) { internal_memset(dir, 0, sizeof(*dir)); dir->argIdx = -1; dir->precisionIdx = -1; while (*p) { if (*p != '%') { ++p; continue; } dir->begin = p; ++p; // %% if (*p == '%') { ++p; continue; } if (*p == '\0') { return nullptr; } // %n$ p = maybe_parse_param_index(p, &dir->precisionIdx); CHECK(p); // Flags while (char_is_one_of(*p, "'-+ #0")) { ++p; } // Field width p = maybe_parse_number_or_star(p, &dir->fieldWidth, &dir->starredWidth); if (!p) return nullptr; // Precision if (*p == '.') { ++p; // Actual precision is optional (surprise!) p = maybe_parse_number_or_star(p, &dir->fieldPrecision, &dir->starredPrecision); if (!p) return nullptr; // m$ if (dir->starredPrecision) { p = maybe_parse_param_index(p, &dir->precisionIdx); CHECK(p); } } // Length modifier. p = maybe_parse_length_modifier(p, dir->lengthModifier); // Conversion specifier. dir->convSpecifier = *p++; dir->end = p; break; } return p; } static int printf_get_value_size(PrintfDirective *dir) { if (char_is_one_of(dir->convSpecifier, "cCsS")) { unsigned charSize = format_get_char_size(dir->convSpecifier, dir->lengthModifier); if (charSize == 0) return FSS_INVALID; if (char_is_one_of(dir->convSpecifier, "sS")) { return (charSize == sizeof(char)) ? 
FSS_STRLEN : FSS_WCSLEN; } return charSize; } return format_get_value_size(dir->convSpecifier, dir->lengthModifier, true); } #define SKIP_SCALAR_ARG(aq, convSpecifier, size) \ do { \ if (format_is_float_conv(convSpecifier)) { \ switch (size) { \ case 8: \ va_arg(*aq, double); \ break; \ case 12: \ va_arg(*aq, long double); \ break; \ case 16: \ va_arg(*aq, long double); \ break; \ default: \ Report("WARNING: unexpected floating-point arg size" \ " in printf interceptor: %zu\n", static_cast(size)); \ return; \ } \ } else { \ switch (size) { \ case 1: \ case 2: \ case 4: \ va_arg(*aq, u32); \ break; \ case 8: \ va_arg(*aq, u64); \ break; \ default: \ Report("WARNING: unexpected arg size" \ " in printf interceptor: %zu\n", static_cast(size)); \ return; \ } \ } \ } while (0) // Common part of *printf interceptors. // Process format string and va_list, and report all load ranges. static void printf_common(void *ctx, const char *format, va_list aq) { COMMON_INTERCEPTOR_READ_RANGE(ctx, format, internal_strlen(format) + 1); const char *p = format; while (*p) { PrintfDirective dir; p = printf_parse_next(p, &dir); if (!p) break; if (dir.convSpecifier == 0) { // This can only happen at the end of the format string. CHECK_EQ(*p, 0); break; } // Here the directive is valid. Do what it says. if (dir.argIdx != -1 || dir.precisionIdx != -1) { // Unsupported. break; } if (dir.starredWidth) { // Dynamic width SKIP_SCALAR_ARG(&aq, 'd', sizeof(int)); } if (dir.starredPrecision) { // Dynamic precision SKIP_SCALAR_ARG(&aq, 'd', sizeof(int)); } // %m does not require an argument: strlen(errno). if (dir.convSpecifier == 'm') continue; int size = printf_get_value_size(&dir); if (size == FSS_INVALID) { static int ReportedOnce; if (!ReportedOnce++) Report( "%s: WARNING: unexpected format specifier in printf " "interceptor: %.*s (reported once per process)\n", SanitizerToolName, static_cast(dir.end - dir.begin), dir.begin); break; } if (dir.convSpecifier == 'n') { void *argp = va_arg(aq, void *); COMMON_INTERCEPTOR_WRITE_RANGE(ctx, argp, size); continue; } else if (size == FSS_STRLEN) { if (void *argp = va_arg(aq, void *)) { if (dir.starredPrecision) { // FIXME: properly support starred precision for strings. size = 0; } else if (dir.fieldPrecision > 0) { // Won't read more than "precision" symbols. size = internal_strnlen((const char *)argp, dir.fieldPrecision); if (size < dir.fieldPrecision) size++; } else { // Whole string will be accessed. size = internal_strlen((const char *)argp) + 1; } COMMON_INTERCEPTOR_READ_RANGE(ctx, argp, size); } } else if (size == FSS_WCSLEN) { if (void *argp = va_arg(aq, void *)) { // FIXME: Properly support wide-character strings (via wcsrtombs). size = 0; COMMON_INTERCEPTOR_READ_RANGE(ctx, argp, size); } } else { // Skip non-pointer args SKIP_SCALAR_ARG(&aq, dir.convSpecifier, size); } } } #endif // SANITIZER_INTERCEPT_PRINTF diff --git a/libcxx/include/__config b/libcxx/include/__config index 9759d3b9e8e0..43f8a20031ff 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1,1472 +1,1486 @@ // -*- C++ -*- //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef _LIBCPP___CONFIG #define _LIBCPP___CONFIG #include <__config_site> #if defined(_MSC_VER) && !defined(__clang__) # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # define _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # endif #endif #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header #endif #if defined(__apple_build_version__) // Given AppleClang XX.Y.Z, _LIBCPP_APPLE_CLANG_VER is XXYZ (e.g. AppleClang 14.0.3 => 1403) # define _LIBCPP_COMPILER_CLANG_BASED # define _LIBCPP_APPLE_CLANG_VER (__apple_build_version__ / 10000) #elif defined(__clang__) # define _LIBCPP_COMPILER_CLANG_BASED # define _LIBCPP_CLANG_VER (__clang_major__ * 100 + __clang_minor__) #elif defined(__GNUC__) # define _LIBCPP_COMPILER_GCC #endif #ifdef __cplusplus // The attributes supported by clang are documented at https://clang.llvm.org/docs/AttributeReference.html // _LIBCPP_VERSION represents the version of libc++, which matches the version of LLVM. // Given a LLVM release LLVM XX.YY.ZZ (e.g. LLVM 17.0.1 == 17.00.01), _LIBCPP_VERSION is // defined to XXYYZZ. # define _LIBCPP_VERSION 170000 # define _LIBCPP_CONCAT_IMPL(_X, _Y) _X##_Y # define _LIBCPP_CONCAT(_X, _Y) _LIBCPP_CONCAT_IMPL(_X, _Y) // Valid C++ identifier that revs with every libc++ version. This can be used to // generate identifiers that must be unique for every released libc++ version. # define _LIBCPP_VERSIONED_IDENTIFIER _LIBCPP_CONCAT(v, _LIBCPP_VERSION) # if __STDC_HOSTED__ == 0 # define _LIBCPP_FREESTANDING # endif // NOLINTBEGIN(libcpp-cpp-version-check) # ifndef _LIBCPP_STD_VER # if __cplusplus <= 201103L # define _LIBCPP_STD_VER 11 # elif __cplusplus <= 201402L # define _LIBCPP_STD_VER 14 # elif __cplusplus <= 201703L # define _LIBCPP_STD_VER 17 # elif __cplusplus <= 202002L # define _LIBCPP_STD_VER 20 # elif __cplusplus <= 202302L # define _LIBCPP_STD_VER 23 # else // Expected release year of the next C++ standard # define _LIBCPP_STD_VER 26 # endif # endif // _LIBCPP_STD_VER // NOLINTEND(libcpp-cpp-version-check) # if defined(__ELF__) # define _LIBCPP_OBJECT_FORMAT_ELF 1 # elif defined(__MACH__) # define _LIBCPP_OBJECT_FORMAT_MACHO 1 # elif defined(_WIN32) # define _LIBCPP_OBJECT_FORMAT_COFF 1 # elif defined(__wasm__) # define _LIBCPP_OBJECT_FORMAT_WASM 1 # elif defined(_AIX) # define _LIBCPP_OBJECT_FORMAT_XCOFF 1 # else // ... add new file formats here ... # endif // ABI { # if _LIBCPP_ABI_VERSION >= 2 // Change short string representation so that string data starts at offset 0, // improving its alignment in some cases. # define _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT // Fix deque iterator type in order to support incomplete types. # define _LIBCPP_ABI_INCOMPLETE_TYPES_IN_DEQUE // Fix undefined behavior in how std::list stores its linked nodes. # define _LIBCPP_ABI_LIST_REMOVE_NODE_POINTER_UB // Fix undefined behavior in how __tree stores its end and parent nodes. # define _LIBCPP_ABI_TREE_REMOVE_NODE_POINTER_UB // Fix undefined behavior in how __hash_table stores its pointer types. # define _LIBCPP_ABI_FIX_UNORDERED_NODE_POINTER_UB # define _LIBCPP_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB # define _LIBCPP_ABI_FIX_UNORDERED_CONTAINER_SIZE_TYPE // Define a key function for `bad_function_call` in the library, to centralize // its vtable and typeinfo to libc++ rather than having all other libraries // using that class define their own copies. 
# define _LIBCPP_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION // Override the default return value of exception::what() for // bad_function_call::what() with a string that is specific to // bad_function_call (see http://wg21.link/LWG2233). This is an ABI break // because it changes the vtable layout of bad_function_call. # define _LIBCPP_ABI_BAD_FUNCTION_CALL_GOOD_WHAT_MESSAGE // Enable optimized version of __do_get_(un)signed which avoids redundant copies. # define _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET // Give reverse_iterator one data member of type T, not two. // Also, in C++17 and later, don't derive iterator types from std::iterator. # define _LIBCPP_ABI_NO_ITERATOR_BASES // Use the smallest possible integer type to represent the index of the variant. // Previously libc++ used "unsigned int" exclusively. # define _LIBCPP_ABI_VARIANT_INDEX_TYPE_OPTIMIZATION // Unstable attempt to provide a more optimized std::function # define _LIBCPP_ABI_OPTIMIZED_FUNCTION // All the regex constants must be distinct and nonzero. # define _LIBCPP_ABI_REGEX_CONSTANTS_NONZERO // Re-worked external template instantiations for std::string with a focus on // performance and fast-path inlining. # define _LIBCPP_ABI_STRING_OPTIMIZED_EXTERNAL_INSTANTIATION // Enable clang::trivial_abi on std::unique_ptr. # define _LIBCPP_ABI_ENABLE_UNIQUE_PTR_TRIVIAL_ABI // Enable clang::trivial_abi on std::shared_ptr and std::weak_ptr # define _LIBCPP_ABI_ENABLE_SHARED_PTR_TRIVIAL_ABI // std::random_device holds some state when it uses an implementation that gets // entropy from a file (see _LIBCPP_USING_DEV_RANDOM). When switching from this // implementation to another one on a platform that has already shipped // std::random_device, one needs to retain the same object layout to remain ABI // compatible. This switch removes these workarounds for platforms that don't care // about ABI compatibility. # define _LIBCPP_ABI_NO_RANDOM_DEVICE_COMPATIBILITY_LAYOUT // Don't export the legacy __basic_string_common class and its methods from the built library. # define _LIBCPP_ABI_DO_NOT_EXPORT_BASIC_STRING_COMMON // Don't export the legacy __vector_base_common class and its methods from the built library. # define _LIBCPP_ABI_DO_NOT_EXPORT_VECTOR_BASE_COMMON // According to the Standard, `bitset::operator[] const` returns bool # define _LIBCPP_ABI_BITSET_VECTOR_BOOL_CONST_SUBSCRIPT_RETURN_BOOL // Fix the implementation of CityHash used for std::hash. // This is an ABI break because `std::hash` will return a different result, // which means that hashing the same object in translation units built against // different versions of libc++ can return inconsistent results. This is especially // tricky since std::hash is used in the implementation of unordered containers. // // The incorrect implementation of CityHash has the problem that it drops some // bits on the floor. # define _LIBCPP_ABI_FIX_CITYHASH_IMPLEMENTATION // Remove the base 10 implementation of std::to_chars from the dylib. // The implementation moved to the header, but we still export the symbols from // the dylib for backwards compatibility. # define _LIBCPP_ABI_DO_NOT_EXPORT_TO_CHARS_BASE_10 # elif _LIBCPP_ABI_VERSION == 1 # if !(defined(_LIBCPP_OBJECT_FORMAT_COFF) || defined(_LIBCPP_OBJECT_FORMAT_XCOFF)) // Enable compiling copies of now inline methods into the dylib to support // applications compiled against older libraries. This is unnecessary with // COFF dllexport semantics, since dllexport forces a non-inline definition // of inline functions to be emitted anyway. 
Our own non-inline copy would // conflict with the dllexport-emitted copy, so we disable it. For XCOFF, // the linker will take issue with the symbols in the shared object if the // weak inline methods get visibility (such as from -fvisibility-inlines-hidden), // so disable it. # define _LIBCPP_DEPRECATED_ABI_LEGACY_LIBRARY_DEFINITIONS_FOR_INLINE_FUNCTIONS # endif // Feature macros for disabling pre ABI v1 features. All of these options // are deprecated. # if defined(__FreeBSD__) # define _LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR # endif // For XCOFF linkers, we have problems if we see a weak hidden version of a symbol // in user code (like you get with -fvisibility-inlines-hidden) and then a strong def // in the library, so we need to always rely on the library version. # if defined(_AIX) # define _LIBCPP_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION # endif # endif # if defined(_LIBCPP_BUILDING_LIBRARY) || _LIBCPP_ABI_VERSION >= 2 // Enable additional explicit instantiations of iostreams components. This // reduces the number of weak definitions generated in programs that use // iostreams by providing a single strong definition in the shared library. # define _LIBCPP_ABI_ENABLE_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 // Define a key function for `bad_function_call` in the library, to centralize // its vtable and typeinfo to libc++ rather than having all other libraries // using that class define their own copies. # define _LIBCPP_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION # endif // Changes the iterator type of select containers (see below) to a bounded iterator that keeps track of whether it's // within the bounds of the original container and asserts it on every dereference. // // ABI impact: changes the iterator type of the relevant containers. // // Supported containers: // - `span`; // - `string_view`; // - `array`. // #define _LIBCPP_ABI_BOUNDED_ITERATORS // } ABI // HARDENING { -// TODO(hardening): remove this in LLVM 18. -// This is for backward compatibility -- make enabling `_LIBCPP_ENABLE_ASSERTIONS` (which predates hardening modes) -// equivalent to setting the hardened mode. -# ifdef _LIBCPP_ENABLE_ASSERTIONS -# warning "_LIBCPP_ENABLE_ASSERTIONS is deprecated, please use _LIBCPP_ENABLE_HARDENED_MODE instead." -# if _LIBCPP_ENABLE_ASSERTIONS != 0 && _LIBCPP_ENABLE_ASSERTIONS != 1 -# error "_LIBCPP_ENABLE_ASSERTIONS must be set to 0 or 1" -# endif -# if _LIBCPP_ENABLE_ASSERTIONS -# define _LIBCPP_ENABLE_HARDENED_MODE 1 -# endif +# ifndef _LIBCPP_ENABLE_ASSERTIONS +# define _LIBCPP_ENABLE_ASSERTIONS _LIBCPP_ENABLE_ASSERTIONS_DEFAULT +# endif +# if _LIBCPP_ENABLE_ASSERTIONS != 0 && _LIBCPP_ENABLE_ASSERTIONS != 1 +# error "_LIBCPP_ENABLE_ASSERTIONS must be set to 0 or 1" # endif +// NOTE: These modes are experimental and are not stable yet in LLVM 17. Please refrain from using them and use the +// documented libc++ "safe" mode instead. +// // Enables the hardened mode which consists of all checks intended to be used in production. Hardened mode prioritizes // security-critical checks that can be done with relatively little overhead in constant time. Mutually exclusive with // `_LIBCPP_ENABLE_DEBUG_MODE`. // // #define _LIBCPP_ENABLE_HARDENED_MODE 1 // Enables the debug mode which contains all the checks from the hardened mode and additionally more expensive checks // that may affect the complexity of algorithms. The debug mode is intended to be used for testing, not in production. // Mutually exclusive with `_LIBCPP_ENABLE_HARDENED_MODE`. 
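// ---------------------------------------------------------------------------
// Usage sketch for the hardening opt-in described above (illustrative, not
// part of this header). It assumes a translation unit compiled with
// -D_LIBCPP_ENABLE_HARDENED_MODE=1 (or -D_LIBCPP_ENABLE_DEBUG_MODE=1) and
// that element access is guarded by the valid-element-access assertion
// category defined further down.
#include <vector>

int first_element(const std::vector<int>& v) {
  // With a hardening mode enabled, an out-of-bounds access here is expected
  // to terminate the program; with all checks disabled it is undefined
  // behavior instead.
  return v[0];
}

int main() {
  std::vector<int> v{1, 2, 3};
  return first_element(v) == 1 ? 0 : 1; // in-bounds access, fine in any mode
}
// ---------------------------------------------------------------------------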
// // #define _LIBCPP_ENABLE_DEBUG_MODE 1 // Inside the library, assertions are categorized so they can be cherry-picked based on the chosen hardening mode. These // macros are only for internal use -- users should only pick one of the high-level hardening modes described above. // // - `_LIBCPP_ASSERT_VALID_INPUT_RANGE` -- checks that ranges (whether expressed as an iterator pair, an iterator and // a sentinel, an iterator and a count, or a `std::range`) given as input to library functions are valid: // - the sentinel is reachable from the begin iterator; // - TODO(hardening): both iterators refer to the same container. // // - `_LIBCPP_ASSERT_VALID_ELEMENT_ACCESS` -- checks that any attempts to access a container element, whether through // the container object or through an iterator, are valid and do not attempt to go out of bounds or otherwise access // a non-existent element. For iterator checks to work, bounded iterators must be enabled in the ABI. Types like // `optional` and `function` are considered one-element containers for the purposes of this check. // // - `_LIBCPP_ASSERT_NON_OVERLAPPING_RANGES` -- for functions that take several ranges as arguments, checks that the // given ranges do not overlap. // // - `_LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR` -- checks any operations that exchange nodes between containers to make sure // the containers have compatible allocators. // // - `_LIBCPP_ASSERT_INTERNAL` -- checks that internal invariants of the library hold. These assertions don't depend on // user input. // // - `_LIBCPP_ASSERT_UNCATEGORIZED` -- for assertions that haven't been properly classified yet. # ifndef _LIBCPP_ENABLE_HARDENED_MODE # define _LIBCPP_ENABLE_HARDENED_MODE _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT # endif # if _LIBCPP_ENABLE_HARDENED_MODE != 0 && _LIBCPP_ENABLE_HARDENED_MODE != 1 # error "_LIBCPP_ENABLE_HARDENED_MODE must be set to 0 or 1." # endif # ifndef _LIBCPP_ENABLE_DEBUG_MODE # define _LIBCPP_ENABLE_DEBUG_MODE _LIBCPP_ENABLE_DEBUG_MODE_DEFAULT # endif # if _LIBCPP_ENABLE_DEBUG_MODE != 0 && _LIBCPP_ENABLE_DEBUG_MODE != 1 # error "_LIBCPP_ENABLE_DEBUG_MODE must be set to 0 or 1." # endif # if _LIBCPP_ENABLE_HARDENED_MODE && _LIBCPP_ENABLE_DEBUG_MODE # error "Only one of _LIBCPP_ENABLE_HARDENED_MODE and _LIBCPP_ENABLE_DEBUG_MODE can be enabled." # endif +# if _LIBCPP_ENABLE_ASSERTIONS && (_LIBCPP_ENABLE_HARDENED_MODE || _LIBCPP_ENABLE_DEBUG_MODE) +# error \ + "_LIBCPP_ENABLE_ASSERTIONS is mutually exclusive with _LIBCPP_ENABLE_HARDENED_MODE and _LIBCPP_ENABLE_DEBUG_MODE." +# endif + // Hardened mode checks. // clang-format off # if _LIBCPP_ENABLE_HARDENED_MODE // Enabled checks. # define _LIBCPP_ASSERT_VALID_INPUT_RANGE(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(expression, message) _LIBCPP_ASSERT(expression, message) // Disabled checks. // Overlapping ranges will make algorithms produce incorrect results but don't directly lead to a security // vulnerability. # define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_INTERNAL(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message) _LIBCPP_ASSUME(expression) // Debug mode checks. # elif _LIBCPP_ENABLE_DEBUG_MODE // All checks enabled. 
# define _LIBCPP_ASSERT_VALID_INPUT_RANGE(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_INTERNAL(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message) _LIBCPP_ASSERT(expression, message) +// Safe mode checks. + +# elif _LIBCPP_ENABLE_ASSERTIONS + +// All checks enabled. +# define _LIBCPP_ASSERT_VALID_INPUT_RANGE(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_INTERNAL(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message) _LIBCPP_ASSERT(expression, message) + // Disable all checks if hardening is not enabled. # else // All checks disabled. # define _LIBCPP_ASSERT_VALID_INPUT_RANGE(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_INTERNAL(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message) _LIBCPP_ASSUME(expression) # endif // _LIBCPP_ENABLE_HARDENED_MODE // clang-format on // } HARDENING # define _LIBCPP_TOSTRING2(x) #x # define _LIBCPP_TOSTRING(x) _LIBCPP_TOSTRING2(x) // NOLINTNEXTLINE(libcpp-cpp-version-check) # if __cplusplus < 201103L # define _LIBCPP_CXX03_LANG # endif # ifndef __has_attribute # define __has_attribute(__x) 0 # endif # ifndef __has_builtin # define __has_builtin(__x) 0 # endif # ifndef __has_extension # define __has_extension(__x) 0 # endif # ifndef __has_feature # define __has_feature(__x) 0 # endif # ifndef __has_cpp_attribute # define __has_cpp_attribute(__x) 0 # endif # ifndef __has_constexpr_builtin # define __has_constexpr_builtin(x) 0 # endif // '__is_identifier' returns '0' if '__x' is a reserved identifier provided by // the compiler and '1' otherwise. # ifndef __is_identifier # define __is_identifier(__x) 1 # endif # ifndef __has_declspec_attribute # define __has_declspec_attribute(__x) 0 # endif # define __has_keyword(__x) !(__is_identifier(__x)) # ifndef __has_include # define __has_include(...) 0 # endif # if !defined(_LIBCPP_COMPILER_CLANG_BASED) && __cplusplus < 201103L # error "libc++ only supports C++03 with Clang-based compilers. Please enable C++11" # endif // FIXME: ABI detection should be done via compiler builtin macros. This // is just a placeholder until Clang implements such macros. For now assume // that Windows compilers pretending to be MSVC++ target the Microsoft ABI, // and allow the user to explicitly specify the ABI to handle cases where this // heuristic falls short. 
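// ---------------------------------------------------------------------------
// Freestanding sketch of the feature-detection pattern used above
// (illustrative, not part of this header): define the __has_* probe to 0 when
// the compiler does not provide it, then branch on it everywhere else without
// extra #ifdef guards. The EXAMPLE_NORETURN name is a placeholder.
#ifndef __has_attribute
#  define __has_attribute(__x) 0
#endif

#if __has_attribute(__noreturn__)
#  define EXAMPLE_NORETURN __attribute__((__noreturn__))
#else
#  define EXAMPLE_NORETURN
#endif

#include <cstdlib>

EXAMPLE_NORETURN void fatal_error() { std::abort(); }

int main() { return 0; } // fatal_error() exists only to show the attribute
// ---------------------------------------------------------------------------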
# if defined(_LIBCPP_ABI_FORCE_ITANIUM) && defined(_LIBCPP_ABI_FORCE_MICROSOFT) # error "Only one of _LIBCPP_ABI_FORCE_ITANIUM and _LIBCPP_ABI_FORCE_MICROSOFT can be defined" # elif defined(_LIBCPP_ABI_FORCE_ITANIUM) # define _LIBCPP_ABI_ITANIUM # elif defined(_LIBCPP_ABI_FORCE_MICROSOFT) # define _LIBCPP_ABI_MICROSOFT # else # if defined(_WIN32) && defined(_MSC_VER) # define _LIBCPP_ABI_MICROSOFT # else # define _LIBCPP_ABI_ITANIUM # endif # endif # if defined(_LIBCPP_ABI_MICROSOFT) && !defined(_LIBCPP_NO_VCRUNTIME) # define _LIBCPP_ABI_VCRUNTIME # endif # if __has_feature(experimental_library) # ifndef _LIBCPP_ENABLE_EXPERIMENTAL # define _LIBCPP_ENABLE_EXPERIMENTAL # endif # endif // Incomplete features get their own specific disabling flags. This makes it // easier to grep for target specific flags once the feature is complete. # if !defined(_LIBCPP_ENABLE_EXPERIMENTAL) && !defined(_LIBCPP_BUILDING_LIBRARY) # define _LIBCPP_HAS_NO_INCOMPLETE_PSTL # endif # if !defined(_LIBCPP_ENABLE_EXPERIMENTAL) && !defined(_LIBCPP_BUILDING_LIBRARY) # define _LIBCPP_HAS_NO_EXPERIMENTAL_STOP_TOKEN # endif // Need to detect which libc we're using if we're on Linux. # if defined(__linux__) # include # if defined(__GLIBC_PREREQ) # define _LIBCPP_GLIBC_PREREQ(a, b) __GLIBC_PREREQ(a, b) # else # define _LIBCPP_GLIBC_PREREQ(a, b) 0 # endif // defined(__GLIBC_PREREQ) # endif // defined(__linux__) # if defined(__MVS__) # include // for __NATIVE_ASCII_F # endif # ifdef __LITTLE_ENDIAN__ # if __LITTLE_ENDIAN__ # define _LIBCPP_LITTLE_ENDIAN # endif // __LITTLE_ENDIAN__ # endif // __LITTLE_ENDIAN__ # ifdef __BIG_ENDIAN__ # if __BIG_ENDIAN__ # define _LIBCPP_BIG_ENDIAN # endif // __BIG_ENDIAN__ # endif // __BIG_ENDIAN__ # ifdef __BYTE_ORDER__ # if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ # define _LIBCPP_LITTLE_ENDIAN # elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ # define _LIBCPP_BIG_ENDIAN # endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ # endif // __BYTE_ORDER__ # ifdef __FreeBSD__ # include # include # if _BYTE_ORDER == _LITTLE_ENDIAN # define _LIBCPP_LITTLE_ENDIAN # else // _BYTE_ORDER == _LITTLE_ENDIAN # define _LIBCPP_BIG_ENDIAN # endif // _BYTE_ORDER == _LITTLE_ENDIAN # endif // __FreeBSD__ # if defined(__NetBSD__) || defined(__OpenBSD__) # include # if _BYTE_ORDER == _LITTLE_ENDIAN # define _LIBCPP_LITTLE_ENDIAN # else // _BYTE_ORDER == _LITTLE_ENDIAN # define _LIBCPP_BIG_ENDIAN # endif // _BYTE_ORDER == _LITTLE_ENDIAN # endif // defined(__NetBSD__) || defined(__OpenBSD__) # if defined(_WIN32) # define _LIBCPP_WIN32API # define _LIBCPP_LITTLE_ENDIAN # define _LIBCPP_SHORT_WCHAR 1 // Both MinGW and native MSVC provide a "MSVC"-like environment # define _LIBCPP_MSVCRT_LIKE // If mingw not explicitly detected, assume using MS C runtime only if // a MS compatibility version is specified. # if defined(_MSC_VER) && !defined(__MINGW32__) # define _LIBCPP_MSVCRT // Using Microsoft's C Runtime library # endif # if (defined(_M_AMD64) || defined(__x86_64__)) || (defined(_M_ARM) || defined(__arm__)) # define _LIBCPP_HAS_BITSCAN64 # endif # define _LIBCPP_HAS_OPEN_WITH_WCHAR # endif // defined(_WIN32) # if defined(_AIX) && !defined(__64BIT__) // The size of wchar is 2 byte on 32-bit mode on AIX. # define _LIBCPP_SHORT_WCHAR 1 # endif // Libc++ supports various implementations of std::random_device. // // _LIBCPP_USING_DEV_RANDOM // Read entropy from the given file, by default `/dev/urandom`. // If a token is provided, it is assumed to be the path to a file // to read entropy from. 
This is the default behavior if nothing // else is specified. This implementation requires storing state // inside `std::random_device`. // // _LIBCPP_USING_ARC4_RANDOM // Use arc4random(). This allows obtaining random data even when // using sandboxing mechanisms. On some platforms like Apple, this // is the recommended source of entropy for user-space programs. // When this option is used, the token passed to `std::random_device`'s // constructor *must* be "/dev/urandom" -- anything else is an error. // // _LIBCPP_USING_GETENTROPY // Use getentropy(). // When this option is used, the token passed to `std::random_device`'s // constructor *must* be "/dev/urandom" -- anything else is an error. // // _LIBCPP_USING_FUCHSIA_CPRNG // Use Fuchsia's zx_cprng_draw() system call, which is specified to // deliver high-quality entropy and cannot fail. // When this option is used, the token passed to `std::random_device`'s // constructor *must* be "/dev/urandom" -- anything else is an error. // // _LIBCPP_USING_NACL_RANDOM // NaCl's sandbox (which PNaCl also runs in) doesn't allow filesystem access, // including accesses to the special files under `/dev`. This implementation // uses the NaCL syscall `nacl_secure_random_init()` to get entropy. // When this option is used, the token passed to `std::random_device`'s // constructor *must* be "/dev/urandom" -- anything else is an error. // // _LIBCPP_USING_WIN32_RANDOM // Use rand_s(), for use on Windows. // When this option is used, the token passed to `std::random_device`'s // constructor *must* be "/dev/urandom" -- anything else is an error. # if defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__DragonFly__) # define _LIBCPP_USING_ARC4_RANDOM # elif defined(__wasi__) || defined(__EMSCRIPTEN__) # define _LIBCPP_USING_GETENTROPY # elif defined(__Fuchsia__) # define _LIBCPP_USING_FUCHSIA_CPRNG # elif defined(__native_client__) # define _LIBCPP_USING_NACL_RANDOM # elif defined(_LIBCPP_WIN32API) # define _LIBCPP_USING_WIN32_RANDOM # else # define _LIBCPP_USING_DEV_RANDOM # endif # if !defined(_LIBCPP_LITTLE_ENDIAN) && !defined(_LIBCPP_BIG_ENDIAN) # include # if __BYTE_ORDER == __LITTLE_ENDIAN # define _LIBCPP_LITTLE_ENDIAN # elif __BYTE_ORDER == __BIG_ENDIAN # define _LIBCPP_BIG_ENDIAN # else // __BYTE_ORDER == __BIG_ENDIAN # error unable to determine endian # endif # endif // !defined(_LIBCPP_LITTLE_ENDIAN) && !defined(_LIBCPP_BIG_ENDIAN) # if __has_attribute(__no_sanitize__) && !defined(_LIBCPP_COMPILER_GCC) # define _LIBCPP_NO_CFI __attribute__((__no_sanitize__("cfi"))) # else # define _LIBCPP_NO_CFI # endif # ifndef _LIBCPP_CXX03_LANG # define _LIBCPP_ALIGNOF(_Tp) alignof(_Tp) # define _ALIGNAS_TYPE(x) alignas(x) # define _ALIGNAS(x) alignas(x) # define _LIBCPP_NORETURN [[noreturn]] # define _NOEXCEPT noexcept # define _NOEXCEPT_(x) noexcept(x) # define _LIBCPP_CONSTEXPR constexpr # else # define _LIBCPP_ALIGNOF(_Tp) _Alignof(_Tp) # define _ALIGNAS_TYPE(x) __attribute__((__aligned__(_LIBCPP_ALIGNOF(x)))) # define _ALIGNAS(x) __attribute__((__aligned__(x))) # define _LIBCPP_NORETURN __attribute__((__noreturn__)) # define _LIBCPP_HAS_NO_NOEXCEPT # define nullptr __nullptr # define _NOEXCEPT throw() # define _NOEXCEPT_(x) # define static_assert(...) _Static_assert(__VA_ARGS__) # define decltype(...) 
__decltype(__VA_ARGS__) # define _LIBCPP_CONSTEXPR typedef __char16_t char16_t; typedef __char32_t char32_t; # endif # if !defined(__cpp_exceptions) || __cpp_exceptions < 199711L # define _LIBCPP_HAS_NO_EXCEPTIONS # endif # define _LIBCPP_PREFERRED_ALIGNOF(_Tp) __alignof(_Tp) # if defined(_LIBCPP_COMPILER_CLANG_BASED) # if defined(__APPLE__) # if defined(__i386__) || defined(__x86_64__) // use old string layout on x86_64 and i386 # elif defined(__arm__) // use old string layout on arm (which does not include aarch64/arm64), except on watch ABIs # if defined(__ARM_ARCH_7K__) && __ARM_ARCH_7K__ >= 2 # define _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT # endif # else # define _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT # endif # endif // Objective-C++ features (opt-in) # if __has_feature(objc_arc) # define _LIBCPP_HAS_OBJC_ARC # endif # if __has_feature(objc_arc_weak) # define _LIBCPP_HAS_OBJC_ARC_WEAK # endif # if __has_extension(blocks) # define _LIBCPP_HAS_EXTENSION_BLOCKS # endif # if defined(_LIBCPP_HAS_EXTENSION_BLOCKS) && defined(__APPLE__) # define _LIBCPP_HAS_BLOCKS_RUNTIME # endif # if !__has_feature(address_sanitizer) # define _LIBCPP_HAS_NO_ASAN # endif // Allow for build-time disabling of unsigned integer sanitization # if __has_attribute(no_sanitize) # define _LIBCPP_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK __attribute__((__no_sanitize__("unsigned-integer-overflow"))) # endif # define _LIBCPP_ALWAYS_INLINE __attribute__((__always_inline__)) # define _LIBCPP_DISABLE_EXTENSION_WARNING __extension__ # elif defined(_LIBCPP_COMPILER_GCC) # if !defined(__SANITIZE_ADDRESS__) # define _LIBCPP_HAS_NO_ASAN # endif # define _LIBCPP_ALWAYS_INLINE __attribute__((__always_inline__)) # define _LIBCPP_DISABLE_EXTENSION_WARNING __extension__ # endif // _LIBCPP_COMPILER_[CLANG|GCC] # if defined(_LIBCPP_OBJECT_FORMAT_COFF) # ifdef _DLL # define _LIBCPP_CRT_FUNC __declspec(dllimport) # else # define _LIBCPP_CRT_FUNC # endif # if defined(_LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS) || (defined(__MINGW32__) && !defined(_LIBCPP_BUILDING_LIBRARY)) # define _LIBCPP_DLL_VIS # define _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS # define _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS # define _LIBCPP_OVERRIDABLE_FUNC_VIS # define _LIBCPP_EXPORTED_FROM_ABI # elif defined(_LIBCPP_BUILDING_LIBRARY) # define _LIBCPP_DLL_VIS __declspec(dllexport) # if defined(__MINGW32__) # define _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS _LIBCPP_DLL_VIS # define _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS # else # define _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS # define _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS _LIBCPP_DLL_VIS # endif # define _LIBCPP_OVERRIDABLE_FUNC_VIS _LIBCPP_DLL_VIS # define _LIBCPP_EXPORTED_FROM_ABI __declspec(dllexport) # else # define _LIBCPP_DLL_VIS __declspec(dllimport) # define _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS _LIBCPP_DLL_VIS # define _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS # define _LIBCPP_OVERRIDABLE_FUNC_VIS # define _LIBCPP_EXPORTED_FROM_ABI __declspec(dllimport) # endif # define _LIBCPP_HIDDEN # define _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS # define _LIBCPP_TEMPLATE_VIS # define _LIBCPP_TEMPLATE_DATA_VIS # define _LIBCPP_ENUM_VIS # else # if !defined(_LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS) # define _LIBCPP_VISIBILITY(vis) __attribute__((__visibility__(vis))) # else # define _LIBCPP_VISIBILITY(vis) # endif # define _LIBCPP_HIDDEN _LIBCPP_VISIBILITY("hidden") # define _LIBCPP_TEMPLATE_DATA_VIS _LIBCPP_VISIBILITY("default") # define _LIBCPP_EXPORTED_FROM_ABI _LIBCPP_VISIBILITY("default") # define _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS 
_LIBCPP_VISIBILITY("default") # define _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS // TODO: Make this a proper customization point or remove the option to override it. # ifndef _LIBCPP_OVERRIDABLE_FUNC_VIS # define _LIBCPP_OVERRIDABLE_FUNC_VIS _LIBCPP_VISIBILITY("default") # endif # if !defined(_LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS) // The inline should be removed once PR32114 is resolved # define _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS inline _LIBCPP_HIDDEN # else # define _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS # endif # if !defined(_LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS) # if __has_attribute(__type_visibility__) # define _LIBCPP_TEMPLATE_VIS __attribute__((__type_visibility__("default"))) # else # define _LIBCPP_TEMPLATE_VIS __attribute__((__visibility__("default"))) # endif # else # define _LIBCPP_TEMPLATE_VIS # endif # if !defined(_LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS) && __has_attribute(__type_visibility__) # define _LIBCPP_ENUM_VIS __attribute__((__type_visibility__("default"))) # else # define _LIBCPP_ENUM_VIS # endif # endif // defined(_LIBCPP_OBJECT_FORMAT_COFF) # if __has_attribute(exclude_from_explicit_instantiation) # define _LIBCPP_EXCLUDE_FROM_EXPLICIT_INSTANTIATION __attribute__((__exclude_from_explicit_instantiation__)) # else // Try to approximate the effect of exclude_from_explicit_instantiation // (which is that entities are not assumed to be provided by explicit // template instantiations in the dylib) by always inlining those entities. # define _LIBCPP_EXCLUDE_FROM_EXPLICIT_INSTANTIATION _LIBCPP_ALWAYS_INLINE # endif // This macro marks a symbol as being hidden from libc++'s ABI. This is achieved // on two levels: // 1. The symbol is given hidden visibility, which ensures that users won't start exporting // symbols from their dynamic library by means of using the libc++ headers. This ensures // that those symbols stay private to the dynamic library in which it is defined. // // 2. The symbol is given an ABI tag that changes with each version of libc++. This ensures // that no ODR violation can arise from mixing two TUs compiled with different versions // of libc++ where we would have changed the definition of a symbol. If the symbols shared // the same name, the ODR would require that their definitions be token-by-token equivalent, // which basically prevents us from being able to make any change to any function in our // headers. Using this ABI tag ensures that the symbol name is "bumped" artificially at // each release, which lets us change the definition of these symbols at our leisure. // Note that historically, this has been achieved in various ways, including force-inlining // all functions or giving internal linkage to all functions. Both these (previous) solutions // suffer from drawbacks that lead notably to code bloat. // // Note that we use _LIBCPP_EXCLUDE_FROM_EXPLICIT_INSTANTIATION to ensure that we don't depend // on _LIBCPP_HIDE_FROM_ABI methods of classes explicitly instantiated in the dynamic library. // // Also note that the _LIBCPP_HIDE_FROM_ABI_VIRTUAL macro should be used on virtual functions // instead of _LIBCPP_HIDE_FROM_ABI. That macro does not use an ABI tag. Indeed, the mangled // name of a virtual function is part of its ABI, since some architectures like arm64e can sign // the virtual function pointer in the vtable based on the mangled name of the function. Since // we use an ABI tag that changes with each released version, the mangled name of the virtual // function would change, which is incorrect. 
Note that it doesn't make much sense to change // the implementation of a virtual function in an ABI-incompatible way in the first place, // since that would be an ABI break anyway. Hence, the lack of ABI tag should not be noticeable. // // TODO: We provide a escape hatch with _LIBCPP_NO_ABI_TAG for folks who want to avoid increasing // the length of symbols with an ABI tag. In practice, we should remove the escape hatch and // use compression mangling instead, see https://github.com/itanium-cxx-abi/cxx-abi/issues/70. # ifndef _LIBCPP_NO_ABI_TAG # define _LIBCPP_HIDE_FROM_ABI \ _LIBCPP_HIDDEN _LIBCPP_EXCLUDE_FROM_EXPLICIT_INSTANTIATION \ __attribute__((__abi_tag__(_LIBCPP_TOSTRING(_LIBCPP_VERSIONED_IDENTIFIER)))) # else # define _LIBCPP_HIDE_FROM_ABI _LIBCPP_HIDDEN _LIBCPP_EXCLUDE_FROM_EXPLICIT_INSTANTIATION # endif # define _LIBCPP_HIDE_FROM_ABI_VIRTUAL _LIBCPP_HIDDEN _LIBCPP_EXCLUDE_FROM_EXPLICIT_INSTANTIATION // This macro provides a HIDE_FROM_ABI equivalent that can be applied to extern // "C" function, as those lack mangling. # define _LIBCPP_HIDE_FROM_ABI_C _LIBCPP_HIDDEN _LIBCPP_EXCLUDE_FROM_EXPLICIT_INSTANTIATION # ifdef _LIBCPP_BUILDING_LIBRARY # if _LIBCPP_ABI_VERSION > 1 # define _LIBCPP_HIDE_FROM_ABI_AFTER_V1 _LIBCPP_HIDE_FROM_ABI # else # define _LIBCPP_HIDE_FROM_ABI_AFTER_V1 # endif # else # define _LIBCPP_HIDE_FROM_ABI_AFTER_V1 _LIBCPP_HIDE_FROM_ABI # endif // Just so we can migrate to the new macros gradually. # define _LIBCPP_INLINE_VISIBILITY _LIBCPP_HIDE_FROM_ABI // Inline namespaces are available in Clang/GCC/MSVC regardless of C++ dialect. // clang-format off # define _LIBCPP_BEGIN_NAMESPACE_STD namespace std { inline namespace _LIBCPP_ABI_NAMESPACE { # define _LIBCPP_END_NAMESPACE_STD }} # define _VSTD std _LIBCPP_BEGIN_NAMESPACE_STD _LIBCPP_END_NAMESPACE_STD # if _LIBCPP_STD_VER >= 17 # define _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM \ _LIBCPP_BEGIN_NAMESPACE_STD inline namespace __fs { namespace filesystem { # else # define _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM \ _LIBCPP_BEGIN_NAMESPACE_STD namespace __fs { namespace filesystem { # endif # define _LIBCPP_END_NAMESPACE_FILESYSTEM _LIBCPP_END_NAMESPACE_STD }} // clang-format on # define _VSTD_FS std::__fs::filesystem # if __has_attribute(__enable_if__) # define _LIBCPP_PREFERRED_OVERLOAD __attribute__((__enable_if__(true, ""))) # endif # if !defined(__SIZEOF_INT128__) || defined(_MSC_VER) # define _LIBCPP_HAS_NO_INT128 # endif # if __has_attribute(__malloc__) # define _LIBCPP_NOALIAS __attribute__((__malloc__)) # else # define _LIBCPP_NOALIAS # endif # if __has_attribute(__using_if_exists__) # define _LIBCPP_USING_IF_EXISTS __attribute__((__using_if_exists__)) # else # define _LIBCPP_USING_IF_EXISTS # endif # ifdef _LIBCPP_CXX03_LANG # define _LIBCPP_DECLARE_STRONG_ENUM(x) \ struct _LIBCPP_EXPORTED_FROM_ABI x { \ enum __lx // clang-format off # define _LIBCPP_DECLARE_STRONG_ENUM_EPILOG(x) \ __lx __v_; \ _LIBCPP_INLINE_VISIBILITY x(__lx __v) : __v_(__v) {} \ _LIBCPP_INLINE_VISIBILITY explicit x(int __v) : __v_(static_cast<__lx>(__v)) {} \ _LIBCPP_INLINE_VISIBILITY operator int() const { return __v_; } \ }; // clang-format on # else // _LIBCPP_CXX03_LANG # define _LIBCPP_DECLARE_STRONG_ENUM(x) enum class _LIBCPP_ENUM_VIS x # define _LIBCPP_DECLARE_STRONG_ENUM_EPILOG(x) # endif // _LIBCPP_CXX03_LANG # if defined(__APPLE__) || defined(__FreeBSD__) || defined(_LIBCPP_MSVCRT_LIKE) || defined(__NetBSD__) # define _LIBCPP_LOCALE__L_EXTENSIONS 1 # endif # ifdef __FreeBSD__ # define _DECLARE_C99_LDBL_MATH 1 # endif // If we are getting 
operator new from the MSVC CRT, then allocation overloads // for align_val_t were added in 19.12, aka VS 2017 version 15.3. # if defined(_LIBCPP_MSVCRT) && defined(_MSC_VER) && _MSC_VER < 1912 # define _LIBCPP_HAS_NO_LIBRARY_ALIGNED_ALLOCATION # elif defined(_LIBCPP_ABI_VCRUNTIME) && !defined(__cpp_aligned_new) // We're deferring to Microsoft's STL to provide aligned new et al. We don't // have it unless the language feature test macro is defined. # define _LIBCPP_HAS_NO_LIBRARY_ALIGNED_ALLOCATION # elif defined(__MVS__) # define _LIBCPP_HAS_NO_LIBRARY_ALIGNED_ALLOCATION # endif # if defined(_LIBCPP_HAS_NO_LIBRARY_ALIGNED_ALLOCATION) || (!defined(__cpp_aligned_new) || __cpp_aligned_new < 201606) # define _LIBCPP_HAS_NO_ALIGNED_ALLOCATION # endif // It is not yet possible to use aligned_alloc() on all Apple platforms since // 10.15 was the first version to ship an implementation of aligned_alloc(). # if defined(__APPLE__) # if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && \ __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101500) # define _LIBCPP_HAS_NO_C11_ALIGNED_ALLOC # endif # elif defined(__ANDROID__) && __ANDROID_API__ < 28 // Android only provides aligned_alloc when targeting API 28 or higher. # define _LIBCPP_HAS_NO_C11_ALIGNED_ALLOC # endif # if defined(__APPLE__) || defined(__FreeBSD__) # define _LIBCPP_HAS_DEFAULTRUNELOCALE # endif # if defined(__APPLE__) || defined(__FreeBSD__) # define _LIBCPP_WCTYPE_IS_MASK # endif # if _LIBCPP_STD_VER <= 17 || !defined(__cpp_char8_t) # define _LIBCPP_HAS_NO_CHAR8_T # endif // Deprecation macros. // // Deprecations warnings are always enabled, except when users explicitly opt-out // by defining _LIBCPP_DISABLE_DEPRECATION_WARNINGS. # if !defined(_LIBCPP_DISABLE_DEPRECATION_WARNINGS) # if __has_attribute(__deprecated__) # define _LIBCPP_DEPRECATED __attribute__((__deprecated__)) # define _LIBCPP_DEPRECATED_(m) __attribute__((__deprecated__(m))) # elif _LIBCPP_STD_VER >= 14 # define _LIBCPP_DEPRECATED [[deprecated]] # define _LIBCPP_DEPRECATED_(m) [[deprecated(m)]] # else # define _LIBCPP_DEPRECATED # define _LIBCPP_DEPRECATED_(m) # endif # else # define _LIBCPP_DEPRECATED # define _LIBCPP_DEPRECATED_(m) # endif # if !defined(_LIBCPP_CXX03_LANG) # define _LIBCPP_DEPRECATED_IN_CXX11 _LIBCPP_DEPRECATED # else # define _LIBCPP_DEPRECATED_IN_CXX11 # endif # if _LIBCPP_STD_VER >= 14 # define _LIBCPP_DEPRECATED_IN_CXX14 _LIBCPP_DEPRECATED # else # define _LIBCPP_DEPRECATED_IN_CXX14 # endif # if _LIBCPP_STD_VER >= 17 # define _LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_DEPRECATED # else # define _LIBCPP_DEPRECATED_IN_CXX17 # endif # if _LIBCPP_STD_VER >= 20 # define _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_DEPRECATED # else # define _LIBCPP_DEPRECATED_IN_CXX20 # endif # if _LIBCPP_STD_VER >= 23 # define _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_DEPRECATED # else # define _LIBCPP_DEPRECATED_IN_CXX23 # endif # if !defined(_LIBCPP_HAS_NO_CHAR8_T) # define _LIBCPP_DEPRECATED_WITH_CHAR8_T _LIBCPP_DEPRECATED # else # define _LIBCPP_DEPRECATED_WITH_CHAR8_T # endif // Macros to enter and leave a state where deprecation warnings are suppressed. 
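// ---------------------------------------------------------------------------
// Sketch of how the dialect-gated deprecation macros above are meant to be
// attached to declarations (illustrative; the EXAMPLE_ macro and the
// functions are made up, not libc++ entities).
#if __cplusplus >= 201703L
#  define EXAMPLE_DEPRECATED_IN_CXX17 [[deprecated("use replacement_api() instead")]]
#else
#  define EXAMPLE_DEPRECATED_IN_CXX17
#endif

int replacement_api() { return 1; }

// Warns only when compiled as C++17 or newer; older dialects see a plain
// declaration, mirroring _LIBCPP_DEPRECATED_IN_CXX17 above.
EXAMPLE_DEPRECATED_IN_CXX17 int legacy_api() { return replacement_api(); }

int main() { return legacy_api() == 1 ? 0 : 1; }
// ---------------------------------------------------------------------------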
# if defined(_LIBCPP_COMPILER_CLANG_BASED) || defined(_LIBCPP_COMPILER_GCC) # define _LIBCPP_SUPPRESS_DEPRECATED_PUSH \ _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Wdeprecated\"") \ _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") # define _LIBCPP_SUPPRESS_DEPRECATED_POP _Pragma("GCC diagnostic pop") # else # define _LIBCPP_SUPPRESS_DEPRECATED_PUSH # define _LIBCPP_SUPPRESS_DEPRECATED_POP # endif # if _LIBCPP_STD_VER <= 11 # define _LIBCPP_EXPLICIT_SINCE_CXX14 # else # define _LIBCPP_EXPLICIT_SINCE_CXX14 explicit # endif # if _LIBCPP_STD_VER >= 23 # define _LIBCPP_EXPLICIT_SINCE_CXX23 explicit # else # define _LIBCPP_EXPLICIT_SINCE_CXX23 # endif # if _LIBCPP_STD_VER >= 14 # define _LIBCPP_CONSTEXPR_SINCE_CXX14 constexpr # else # define _LIBCPP_CONSTEXPR_SINCE_CXX14 # endif # if _LIBCPP_STD_VER >= 17 # define _LIBCPP_CONSTEXPR_SINCE_CXX17 constexpr # else # define _LIBCPP_CONSTEXPR_SINCE_CXX17 # endif # if _LIBCPP_STD_VER >= 20 # define _LIBCPP_CONSTEXPR_SINCE_CXX20 constexpr # else # define _LIBCPP_CONSTEXPR_SINCE_CXX20 # endif # if _LIBCPP_STD_VER >= 23 # define _LIBCPP_CONSTEXPR_SINCE_CXX23 constexpr # else # define _LIBCPP_CONSTEXPR_SINCE_CXX23 # endif # if __has_cpp_attribute(nodiscard) # define _LIBCPP_NODISCARD [[__nodiscard__]] # else // We can't use GCC's [[gnu::warn_unused_result]] and // __attribute__((warn_unused_result)), because GCC does not silence them via // (void) cast. # define _LIBCPP_NODISCARD # endif // _LIBCPP_NODISCARD_EXT may be used to apply [[nodiscard]] to entities not // specified as such as an extension. # if !defined(_LIBCPP_DISABLE_NODISCARD_EXT) # define _LIBCPP_NODISCARD_EXT _LIBCPP_NODISCARD # else # define _LIBCPP_NODISCARD_EXT # endif # if _LIBCPP_STD_VER >= 20 || !defined(_LIBCPP_DISABLE_NODISCARD_EXT) # define _LIBCPP_NODISCARD_AFTER_CXX17 _LIBCPP_NODISCARD # else # define _LIBCPP_NODISCARD_AFTER_CXX17 # endif # if __has_attribute(__no_destroy__) # define _LIBCPP_NO_DESTROY __attribute__((__no_destroy__)) # else # define _LIBCPP_NO_DESTROY # endif # ifndef _LIBCPP_HAS_NO_ASAN extern "C" _LIBCPP_EXPORTED_FROM_ABI void __sanitizer_annotate_contiguous_container(const void*, const void*, const void*, const void*); # if _LIBCPP_CLANG_VER >= 1600 extern "C" _LIBCPP_EXPORTED_FROM_ABI void __sanitizer_annotate_double_ended_contiguous_container( const void*, const void*, const void*, const void*, const void*, const void*); extern "C" _LIBCPP_EXPORTED_FROM_ABI int __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, const void*, const void*); # endif # endif // Try to find out if RTTI is disabled. # if !defined(__cpp_rtti) || __cpp_rtti < 199711L # define _LIBCPP_HAS_NO_RTTI # endif # ifndef _LIBCPP_WEAK # define _LIBCPP_WEAK __attribute__((__weak__)) # endif // Thread API // clang-format off # if !defined(_LIBCPP_HAS_NO_THREADS) && \ !defined(_LIBCPP_HAS_THREAD_API_PTHREAD) && \ !defined(_LIBCPP_HAS_THREAD_API_WIN32) && \ !defined(_LIBCPP_HAS_THREAD_API_EXTERNAL) # if defined(__FreeBSD__) || \ defined(__wasi__) || \ defined(__NetBSD__) || \ defined(__OpenBSD__) || \ defined(__NuttX__) || \ defined(__linux__) || \ defined(__GNU__) || \ defined(__APPLE__) || \ defined(__MVS__) || \ defined(_AIX) || \ defined(__EMSCRIPTEN__) // clang-format on # define _LIBCPP_HAS_THREAD_API_PTHREAD # elif defined(__Fuchsia__) // TODO(44575): Switch to C11 thread API when possible. 
# define _LIBCPP_HAS_THREAD_API_PTHREAD # elif defined(_LIBCPP_WIN32API) # define _LIBCPP_HAS_THREAD_API_WIN32 # else # error "No thread API" # endif // _LIBCPP_HAS_THREAD_API # endif // _LIBCPP_HAS_NO_THREADS # if defined(_LIBCPP_HAS_THREAD_API_PTHREAD) # if defined(__ANDROID__) && __ANDROID_API__ >= 30 # define _LIBCPP_HAS_COND_CLOCKWAIT # elif defined(_LIBCPP_GLIBC_PREREQ) # if _LIBCPP_GLIBC_PREREQ(2, 30) # define _LIBCPP_HAS_COND_CLOCKWAIT # endif # endif # endif # if defined(_LIBCPP_HAS_NO_THREADS) && defined(_LIBCPP_HAS_THREAD_API_PTHREAD) # error _LIBCPP_HAS_THREAD_API_PTHREAD may only be defined when \ _LIBCPP_HAS_NO_THREADS is not defined. # endif # if defined(_LIBCPP_HAS_NO_THREADS) && defined(_LIBCPP_HAS_THREAD_API_EXTERNAL) # error _LIBCPP_HAS_THREAD_API_EXTERNAL may not be defined when \ _LIBCPP_HAS_NO_THREADS is defined. # endif # if defined(_LIBCPP_HAS_NO_MONOTONIC_CLOCK) && !defined(_LIBCPP_HAS_NO_THREADS) # error _LIBCPP_HAS_NO_MONOTONIC_CLOCK may only be defined when \ _LIBCPP_HAS_NO_THREADS is defined. # endif # if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(__STDCPP_THREADS__) # define __STDCPP_THREADS__ 1 # endif // The glibc and Bionic implementation of pthreads implements // pthread_mutex_destroy as nop for regular mutexes. Additionally, Win32 // mutexes have no destroy mechanism. // // This optimization can't be performed on Apple platforms, where // pthread_mutex_destroy can allow the kernel to release resources. // See https://llvm.org/D64298 for details. // // TODO(EricWF): Enable this optimization on Bionic after speaking to their // respective stakeholders. // clang-format off # if (defined(_LIBCPP_HAS_THREAD_API_PTHREAD) && defined(__GLIBC__)) || \ (defined(_LIBCPP_HAS_THREAD_API_C11) && defined(__Fuchsia__)) || \ defined(_LIBCPP_HAS_THREAD_API_WIN32) // clang-format on # define _LIBCPP_HAS_TRIVIAL_MUTEX_DESTRUCTION # endif // Destroying a condvar is a nop on Windows. // // This optimization can't be performed on Apple platforms, where // pthread_cond_destroy can allow the kernel to release resources. // See https://llvm.org/D64298 for details. // // TODO(EricWF): This is potentially true for some pthread implementations // as well. # if (defined(_LIBCPP_HAS_THREAD_API_C11) && defined(__Fuchsia__)) || defined(_LIBCPP_HAS_THREAD_API_WIN32) # define _LIBCPP_HAS_TRIVIAL_CONDVAR_DESTRUCTION # endif // Some systems do not provide gets() in their C library, for security reasons. 
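// ---------------------------------------------------------------------------
// Sketch of the optimization that _LIBCPP_HAS_TRIVIAL_MUTEX_DESTRUCTION
// (rationale above) enables. Illustrative only and assumes a pthread
// platform; basic_mutex and EXAMPLE_TRIVIAL_MUTEX_DESTRUCTION are made-up
// stand-ins, not the library's actual std::mutex.
#include <pthread.h>

#if defined(__GLIBC__)
#  define EXAMPLE_TRIVIAL_MUTEX_DESTRUCTION 1 // destroy is a no-op here
#endif

class basic_mutex {
  pthread_mutex_t m_ = PTHREAD_MUTEX_INITIALIZER;

public:
  void lock() { pthread_mutex_lock(&m_); }
  void unlock() { pthread_mutex_unlock(&m_); }

#ifndef EXAMPLE_TRIVIAL_MUTEX_DESTRUCTION
  // Only needed where pthread_mutex_destroy may release kernel resources
  // (e.g. Apple platforms, as noted above).
  ~basic_mutex() { pthread_mutex_destroy(&m_); }
#endif
};

// When the destructor is omitted, a namespace-scope mutex registers no
// exit-time destructor at all.
static basic_mutex g_lock;

int main() {
  g_lock.lock();
  g_lock.unlock();
  return 0;
}
// ---------------------------------------------------------------------------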
# if defined(_LIBCPP_MSVCRT) || (defined(__FreeBSD_version) && __FreeBSD_version >= 1300043) || defined(__OpenBSD__) # define _LIBCPP_C_HAS_NO_GETS # endif # if defined(__BIONIC__) || defined(__NuttX__) || defined(__Fuchsia__) || defined(__wasi__) || \ defined(_LIBCPP_HAS_MUSL_LIBC) || defined(__OpenBSD__) # define _LIBCPP_PROVIDES_DEFAULT_RUNE_TABLE # endif # if __has_feature(cxx_atomic) || __has_extension(c_atomic) || __has_keyword(_Atomic) # define _LIBCPP_HAS_C_ATOMIC_IMP # elif defined(_LIBCPP_COMPILER_GCC) # define _LIBCPP_HAS_GCC_ATOMIC_IMP # endif # if !defined(_LIBCPP_HAS_C_ATOMIC_IMP) && !defined(_LIBCPP_HAS_GCC_ATOMIC_IMP) && \ !defined(_LIBCPP_HAS_EXTERNAL_ATOMIC_IMP) # define _LIBCPP_HAS_NO_ATOMIC_HEADER # else # ifndef _LIBCPP_ATOMIC_FLAG_TYPE # define _LIBCPP_ATOMIC_FLAG_TYPE bool # endif # ifdef _LIBCPP_FREESTANDING # define _LIBCPP_ATOMIC_ONLY_USE_BUILTINS # endif # endif # ifndef _LIBCPP_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK # define _LIBCPP_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK # endif # if defined(__FreeBSD__) && defined(__clang__) && __has_attribute(__no_thread_safety_analysis__) # define _LIBCPP_NO_THREAD_SAFETY_ANALYSIS __attribute__((__no_thread_safety_analysis__)) # else # define _LIBCPP_NO_THREAD_SAFETY_ANALYSIS # endif # if defined(_LIBCPP_ENABLE_THREAD_SAFETY_ANNOTATIONS) # if defined(__clang__) && __has_attribute(acquire_capability) // Work around the attribute handling in clang. When both __declspec and // __attribute__ are present, the processing goes awry preventing the definition // of the types. In MinGW mode, __declspec evaluates to __attribute__, and thus // combining the two does work. # if !defined(_MSC_VER) # define _LIBCPP_HAS_THREAD_SAFETY_ANNOTATIONS # endif # endif # endif # ifdef _LIBCPP_HAS_THREAD_SAFETY_ANNOTATIONS # define _LIBCPP_THREAD_SAFETY_ANNOTATION(x) __attribute__((x)) # else # define _LIBCPP_THREAD_SAFETY_ANNOTATION(x) # endif # if _LIBCPP_STD_VER >= 20 # define _LIBCPP_CONSTINIT constinit # elif __has_attribute(__require_constant_initialization__) # define _LIBCPP_CONSTINIT __attribute__((__require_constant_initialization__)) # else # define _LIBCPP_CONSTINIT # endif # if __has_attribute(__diagnose_if__) && !defined(_LIBCPP_DISABLE_ADDITIONAL_DIAGNOSTICS) # define _LIBCPP_DIAGNOSE_WARNING(...) __attribute__((__diagnose_if__(__VA_ARGS__, "warning"))) # else # define _LIBCPP_DIAGNOSE_WARNING(...) # endif // Use a function like macro to imply that it must be followed by a semicolon # if __has_cpp_attribute(fallthrough) # define _LIBCPP_FALLTHROUGH() [[fallthrough]] # elif __has_attribute(__fallthrough__) # define _LIBCPP_FALLTHROUGH() __attribute__((__fallthrough__)) # else # define _LIBCPP_FALLTHROUGH() ((void)0) # endif # if __has_cpp_attribute(_Clang::__lifetimebound__) # define _LIBCPP_LIFETIMEBOUND [[_Clang::__lifetimebound__]] # else # define _LIBCPP_LIFETIMEBOUND # endif # if __has_attribute(__nodebug__) # define _LIBCPP_NODEBUG __attribute__((__nodebug__)) # else # define _LIBCPP_NODEBUG # endif # if __has_attribute(__standalone_debug__) # define _LIBCPP_STANDALONE_DEBUG __attribute__((__standalone_debug__)) # else # define _LIBCPP_STANDALONE_DEBUG # endif # if __has_attribute(__preferred_name__) # define _LIBCPP_PREFERRED_NAME(x) __attribute__((__preferred_name__(x))) # else # define _LIBCPP_PREFERRED_NAME(x) # endif # if __has_attribute(__no_sanitize__) # define _LIBCPP_NO_SANITIZE(...) __attribute__((__no_sanitize__(__VA_ARGS__))) # else # define _LIBCPP_NO_SANITIZE(...) 
# endif // We often repeat things just for handling wide characters in the library. // When wide characters are disabled, it can be useful to have a quick way of // disabling it without having to resort to #if-#endif, which has a larger // impact on readability. # if defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) # define _LIBCPP_IF_WIDE_CHARACTERS(...) # else # define _LIBCPP_IF_WIDE_CHARACTERS(...) __VA_ARGS__ # endif # if defined(_LIBCPP_ABI_MICROSOFT) && __has_declspec_attribute(empty_bases) # define _LIBCPP_DECLSPEC_EMPTY_BASES __declspec(empty_bases) # else # define _LIBCPP_DECLSPEC_EMPTY_BASES # endif # if defined(_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES) # define _LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR # define _LIBCPP_ENABLE_CXX17_REMOVED_BINDERS # define _LIBCPP_ENABLE_CXX17_REMOVED_RANDOM_SHUFFLE # define _LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS # define _LIBCPP_ENABLE_CXX17_REMOVED_UNARY_BINARY_FUNCTION # endif // _LIBCPP_ENABLE_CXX17_REMOVED_FEATURES # if defined(_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES) # define _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS # define _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_VOID_SPECIALIZATION # define _LIBCPP_ENABLE_CXX20_REMOVED_BINDER_TYPEDEFS # define _LIBCPP_ENABLE_CXX20_REMOVED_NEGATORS # define _LIBCPP_ENABLE_CXX20_REMOVED_RAW_STORAGE_ITERATOR # define _LIBCPP_ENABLE_CXX20_REMOVED_TYPE_TRAITS # endif // _LIBCPP_ENABLE_CXX20_REMOVED_FEATURES // clang-format off # define _LIBCPP_PUSH_MACROS _Pragma("push_macro(\"min\")") _Pragma("push_macro(\"max\")") _Pragma("push_macro(\"refresh()\")") _Pragma("push_macro(\"move(int, int)\")") _Pragma("push_macro(\"erase()\")") # define _LIBCPP_POP_MACROS _Pragma("pop_macro(\"min\")") _Pragma("pop_macro(\"max\")") _Pragma("pop_macro(\"refresh()\")") _Pragma("pop_macro(\"move(int, int)\")") _Pragma("pop_macro(\"erase()\")") // clang-format on # ifndef _LIBCPP_NO_AUTO_LINK # if defined(_LIBCPP_ABI_MICROSOFT) && !defined(_LIBCPP_BUILDING_LIBRARY) # if !defined(_LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS) # pragma comment(lib, "c++.lib") # else # pragma comment(lib, "libc++.lib") # endif # endif // defined(_LIBCPP_ABI_MICROSOFT) && !defined(_LIBCPP_BUILDING_LIBRARY) # endif // _LIBCPP_NO_AUTO_LINK // Configures the fopen close-on-exec mode character, if any. This string will // be appended to any mode string used by fstream for fopen/fdopen. // // Not all platforms support this, but it helps avoid fd-leaks on platforms that // do. # if defined(__BIONIC__) # define _LIBCPP_FOPEN_CLOEXEC_MODE "e" # else # define _LIBCPP_FOPEN_CLOEXEC_MODE # endif // Support for _FILE_OFFSET_BITS=64 landed gradually in Android, so the full set // of functions used in cstdio may not be available for low API levels when // using 64-bit file offsets on LP32. # if defined(__BIONIC__) && defined(__USE_FILE_OFFSET64) && __ANDROID_API__ < 24 # define _LIBCPP_HAS_NO_FGETPOS_FSETPOS # endif # if __has_attribute(__init_priority__) # define _LIBCPP_INIT_PRIORITY_MAX __attribute__((__init_priority__(100))) # else # define _LIBCPP_INIT_PRIORITY_MAX # endif # if __has_attribute(__format__) // The attribute uses 1-based indices for ordinary and static member functions. // The attribute uses 2-based indices for non-static member functions. 
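// ---------------------------------------------------------------------------
// Sketch of what the _LIBCPP_IF_WIDE_CHARACTERS helper above is for
// (illustrative; the EXAMPLE_ macro mirrors the definition above and the
// functions are made up): the wchar_t flavor of a declaration disappears on
// configurations without wide character support, with no #if/#endif at the
// use site.
#include <cstddef>
#include <cstring>
#include <cwchar>

#if defined(EXAMPLE_HAS_NO_WIDE_CHARACTERS) // stand-in for _LIBCPP_HAS_NO_WIDE_CHARACTERS
#  define EXAMPLE_IF_WIDE_CHARACTERS(...)
#else
#  define EXAMPLE_IF_WIDE_CHARACTERS(...) __VA_ARGS__
#endif

std::size_t length(const char* s) { return std::strlen(s); }
EXAMPLE_IF_WIDE_CHARACTERS(std::size_t length(const wchar_t* s) { return std::wcslen(s); })

int main() {
  std::size_t n = length("abc");
  EXAMPLE_IF_WIDE_CHARACTERS(n += length(L"de");)
  return static_cast<int>(n); // 5 with wide characters enabled, 3 without
}
// ---------------------------------------------------------------------------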
# define _LIBCPP_ATTRIBUTE_FORMAT(archetype, format_string_index, first_format_arg_index) \ __attribute__((__format__(archetype, format_string_index, first_format_arg_index))) # else # define _LIBCPP_ATTRIBUTE_FORMAT(archetype, format_string_index, first_format_arg_index) /* nothing */ # endif # if __has_cpp_attribute(msvc::no_unique_address) // MSVC implements [[no_unique_address]] as a silent no-op currently. // (If/when MSVC breaks its C++ ABI, it will be changed to work as intended.) // However, MSVC implements [[msvc::no_unique_address]] which does what // [[no_unique_address]] is supposed to do, in general. // Clang-cl does not yet (14.0) implement either [[no_unique_address]] or // [[msvc::no_unique_address]] though. If/when it does implement // [[msvc::no_unique_address]], this should be preferred though. # define _LIBCPP_NO_UNIQUE_ADDRESS [[msvc::no_unique_address]] # elif __has_cpp_attribute(no_unique_address) # define _LIBCPP_NO_UNIQUE_ADDRESS [[__no_unique_address__]] # else # define _LIBCPP_NO_UNIQUE_ADDRESS /* nothing */ // Note that this can be replaced by #error as soon as clang-cl // implements msvc::no_unique_address, since there should be no C++20 // compiler that doesn't support one of the two attributes at that point. // We generally don't want to use this macro outside of C++20-only code, // because using it conditionally in one language version only would make // the ABI inconsistent. # endif # ifdef _LIBCPP_COMPILER_CLANG_BASED # define _LIBCPP_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") # define _LIBCPP_DIAGNOSTIC_POP _Pragma("clang diagnostic pop") # define _LIBCPP_CLANG_DIAGNOSTIC_IGNORED(str) _Pragma(_LIBCPP_TOSTRING(clang diagnostic ignored str)) # define _LIBCPP_GCC_DIAGNOSTIC_IGNORED(str) # elif defined(_LIBCPP_COMPILER_GCC) # define _LIBCPP_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") # define _LIBCPP_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") # define _LIBCPP_CLANG_DIAGNOSTIC_IGNORED(str) # define _LIBCPP_GCC_DIAGNOSTIC_IGNORED(str) _Pragma(_LIBCPP_TOSTRING(GCC diagnostic ignored str)) # else # define _LIBCPP_DIAGNOSTIC_PUSH # define _LIBCPP_DIAGNOSTIC_POP # define _LIBCPP_CLANG_DIAGNOSTIC_IGNORED(str) # define _LIBCPP_GCC_DIAGNOSTIC_IGNORED(str) # endif # if defined(_AIX) && !defined(_LIBCPP_COMPILER_GCC) # define _LIBCPP_PACKED_BYTE_FOR_AIX _Pragma("pack(1)") # define _LIBCPP_PACKED_BYTE_FOR_AIX_END _Pragma("pack(pop)") # else # define _LIBCPP_PACKED_BYTE_FOR_AIX /* empty */ # define _LIBCPP_PACKED_BYTE_FOR_AIX_END /* empty */ # endif # if __has_attribute(__packed__) # define _LIBCPP_PACKED __attribute__((__packed__)) # else # define _LIBCPP_PACKED # endif // c8rtomb() and mbrtoc8() were added in C++20 and C23. Support for these // functions is gradually being added to existing C libraries. The conditions // below check for known C library versions and conditions under which these // functions are declared by the C library. # define _LIBCPP_HAS_NO_C8RTOMB_MBRTOC8 // GNU libc 2.36 and newer declare c8rtomb() and mbrtoc8() in C++ modes if // __cpp_char8_t is defined or if C2X extensions are enabled. Determining // the latter depends on internal GNU libc details that are not appropriate // to depend on here, so any declarations present when __cpp_char8_t is not // defined are ignored. 
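// ---------------------------------------------------------------------------
// Small illustration of what the no_unique_address selection above is used
// for (illustrative; EXAMPLE_NO_UNIQUE_ADDRESS mirrors how
// _LIBCPP_NO_UNIQUE_ADDRESS is chosen, and the types are made up): an empty
// stateless member can overlap with other storage, so the wrapper typically
// stays the size of the pointer alone.
#include <cstdio>

#ifndef __has_cpp_attribute
#  define __has_cpp_attribute(__x) 0
#endif

#if __has_cpp_attribute(msvc::no_unique_address)
#  define EXAMPLE_NO_UNIQUE_ADDRESS [[msvc::no_unique_address]]
#elif __has_cpp_attribute(no_unique_address)
#  define EXAMPLE_NO_UNIQUE_ADDRESS [[__no_unique_address__]]
#else
#  define EXAMPLE_NO_UNIQUE_ADDRESS /* nothing */
#endif

struct empty_deleter {
  void operator()(int* p) const { delete p; }
};

struct owned_int {
  int* ptr = nullptr;
  EXAMPLE_NO_UNIQUE_ADDRESS empty_deleter del;
};

int main() {
  // With the attribute honored, sizeof(owned_int) is typically sizeof(int*).
  std::printf("%zu vs %zu\n", sizeof(owned_int), sizeof(int*));
  return 0;
}
// ---------------------------------------------------------------------------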
# if defined(_LIBCPP_GLIBC_PREREQ) # if _LIBCPP_GLIBC_PREREQ(2, 36) && defined(__cpp_char8_t) # undef _LIBCPP_HAS_NO_C8RTOMB_MBRTOC8 # endif # endif // There are a handful of public standard library types that are intended to // support CTAD but don't need any explicit deduction guides to do so. This // macro is used to mark them as such, which suppresses the // '-Wctad-maybe-unsupported' compiler warning when CTAD is used in user code // with these classes. # if _LIBCPP_STD_VER >= 17 # ifdef _LIBCPP_COMPILER_CLANG_BASED # define _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(_ClassName) \ template \ [[maybe_unused]] _ClassName(typename _Tag::__allow_ctad...)->_ClassName<_Tag...> # else # define _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(ClassName) \ template \ ClassName(typename _Tag::__allow_ctad...)->ClassName<_Tag...> # endif # else # define _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(_ClassName) static_assert(true, "") # endif // TODO(varconst): currently, there are bugs in Clang's intrinsics when handling Objective-C++ `id`, so don't use // compiler intrinsics in the Objective-C++ mode. # ifdef __OBJC__ # define _LIBCPP_WORKAROUND_OBJCXX_COMPILER_INTRINSICS # endif # define _PSTL_PRAGMA(x) _Pragma(#x) // Enable SIMD for compilers that support OpenMP 4.0 # if (defined(_OPENMP) && _OPENMP >= 201307) # define _PSTL_UDR_PRESENT # define _PSTL_PRAGMA_SIMD _PSTL_PRAGMA(omp simd) # define _PSTL_PRAGMA_DECLARE_SIMD _PSTL_PRAGMA(omp declare simd) # define _PSTL_PRAGMA_SIMD_REDUCTION(PRM) _PSTL_PRAGMA(omp simd reduction(PRM)) # define _PSTL_PRAGMA_SIMD_SCAN(PRM) _PSTL_PRAGMA(omp simd reduction(inscan, PRM)) # define _PSTL_PRAGMA_SIMD_INCLUSIVE_SCAN(PRM) _PSTL_PRAGMA(omp scan inclusive(PRM)) # define _PSTL_PRAGMA_SIMD_EXCLUSIVE_SCAN(PRM) _PSTL_PRAGMA(omp scan exclusive(PRM)) // Declaration of reduction functor, where // NAME - the name of the functor // OP - type of the callable object with the reduction operation // omp_in - refers to the local partial result // omp_out - refers to the final value of the combiner operator // omp_priv - refers to the private copy of the initial value // omp_orig - refers to the original variable to be reduced # define _PSTL_PRAGMA_DECLARE_REDUCTION(NAME, OP) \ _PSTL_PRAGMA(omp declare reduction(NAME:OP : omp_out(omp_in)) initializer(omp_priv = omp_orig)) # else // (defined(_OPENMP) && _OPENMP >= 201307) # define _PSTL_PRAGMA_SIMD # define _PSTL_PRAGMA_DECLARE_SIMD # define _PSTL_PRAGMA_SIMD_REDUCTION(PRM) # define _PSTL_PRAGMA_SIMD_SCAN(PRM) # define _PSTL_PRAGMA_SIMD_INCLUSIVE_SCAN(PRM) # define _PSTL_PRAGMA_SIMD_EXCLUSIVE_SCAN(PRM) # define _PSTL_PRAGMA_DECLARE_REDUCTION(NAME, OP) # endif // (defined(_OPENMP) && _OPENMP >= 201307) # define _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED #endif // __cplusplus #endif // _LIBCPP___CONFIG diff --git a/llvm/include/llvm/Analysis/LazyValueInfo.h b/llvm/include/llvm/Analysis/LazyValueInfo.h index b109b7f7e65a..7b2bfdac75a8 100644 --- a/llvm/include/llvm/Analysis/LazyValueInfo.h +++ b/llvm/include/llvm/Analysis/LazyValueInfo.h @@ -1,171 +1,174 @@ //===- LazyValueInfo.h - Value constraint analysis --------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines the interface for lazy computation of value constraint // information. 
// //===----------------------------------------------------------------------===// #ifndef LLVM_ANALYSIS_LAZYVALUEINFO_H #define LLVM_ANALYSIS_LAZYVALUEINFO_H #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" namespace llvm { class AssumptionCache; class Constant; class ConstantRange; class DataLayout; class DominatorTree; class Instruction; class TargetLibraryInfo; class Value; /// This pass computes, caches, and vends lazy value constraint information. class LazyValueInfo { friend class LazyValueInfoWrapperPass; AssumptionCache *AC = nullptr; const DataLayout *DL = nullptr; class TargetLibraryInfo *TLI = nullptr; void *PImpl = nullptr; LazyValueInfo(const LazyValueInfo&) = delete; void operator=(const LazyValueInfo&) = delete; public: ~LazyValueInfo(); LazyValueInfo() = default; LazyValueInfo(AssumptionCache *AC_, const DataLayout *DL_, TargetLibraryInfo *TLI_) : AC(AC_), DL(DL_), TLI(TLI_) {} LazyValueInfo(LazyValueInfo &&Arg) : AC(Arg.AC), DL(Arg.DL), TLI(Arg.TLI), PImpl(Arg.PImpl) { Arg.PImpl = nullptr; } LazyValueInfo &operator=(LazyValueInfo &&Arg) { releaseMemory(); AC = Arg.AC; DL = Arg.DL; TLI = Arg.TLI; PImpl = Arg.PImpl; Arg.PImpl = nullptr; return *this; } /// This is used to return true/false/dunno results. enum Tristate { Unknown = -1, False = 0, True = 1 }; // Public query interface. /// Determine whether the specified value comparison with a constant is known /// to be true or false on the specified CFG edge. /// Pred is a CmpInst predicate. Tristate getPredicateOnEdge(unsigned Pred, Value *V, Constant *C, BasicBlock *FromBB, BasicBlock *ToBB, Instruction *CxtI = nullptr); /// Determine whether the specified value comparison with a constant is known /// to be true or false at the specified instruction. /// \p Pred is a CmpInst predicate. If \p UseBlockValue is true, the block /// value is also taken into account. Tristate getPredicateAt(unsigned Pred, Value *V, Constant *C, Instruction *CxtI, bool UseBlockValue); /// Determine whether the specified value comparison is known to be true /// or false at the specified instruction. While this takes two Value's, /// it still requires that one of them is a constant. /// \p Pred is a CmpInst predicate. /// If \p UseBlockValue is true, the block value is also taken into account. Tristate getPredicateAt(unsigned Pred, Value *LHS, Value *RHS, Instruction *CxtI, bool UseBlockValue); /// Determine whether the specified value is known to be a constant at the /// specified instruction. Return null if not. Constant *getConstant(Value *V, Instruction *CxtI); /// Return the ConstantRange constraint that is known to hold for the /// specified value at the specified instruction. This may only be called /// on integer-typed Values. ConstantRange getConstantRange(Value *V, Instruction *CxtI, bool UndefAllowed = true); /// Return the ConstantRange constraint that is known to hold for the value /// at a specific use-site. ConstantRange getConstantRangeAtUse(const Use &U, bool UndefAllowed = true); /// Determine whether the specified value is known to be a /// constant on the specified edge. Return null if not. Constant *getConstantOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB, Instruction *CxtI = nullptr); /// Return the ConstantRage constraint that is known to hold for the /// specified value on the specified edge. This may be only be called /// on integer-typed Values. 
  ConstantRange getConstantRangeOnEdge(Value *V, BasicBlock *FromBB,
                                       BasicBlock *ToBB,
                                       Instruction *CxtI = nullptr);

  /// Inform the analysis cache that we have threaded an edge from
  /// PredBB to OldSucc to be from PredBB to NewSucc instead.
  void threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, BasicBlock *NewSucc);

+  /// Remove information related to this value from the cache.
+  void forgetValue(Value *V);
+
  /// Inform the analysis cache that we have erased a block.
  void eraseBlock(BasicBlock *BB);

  /// Completely flush all previously computed values.
  void clear(const Module *M);

  /// Print the LazyValueInfo Analysis.
  /// We pass in the DTree that is required for identifying which basic blocks
  /// we can solve/print for, in the LVIPrinter.
  void printLVI(Function &F, DominatorTree &DTree, raw_ostream &OS);

  // For old PM pass. Delete once LazyValueInfoWrapperPass is gone.
  void releaseMemory();

  /// Handle invalidation events in the new pass manager.
  bool invalidate(Function &F, const PreservedAnalyses &PA,
                  FunctionAnalysisManager::Invalidator &Inv);
};

/// Analysis to compute lazy value information.
class LazyValueAnalysis : public AnalysisInfoMixin<LazyValueAnalysis> {
public:
  typedef LazyValueInfo Result;
  Result run(Function &F, FunctionAnalysisManager &FAM);

private:
  static AnalysisKey Key;
  friend struct AnalysisInfoMixin<LazyValueAnalysis>;
};

/// Wrapper around LazyValueInfo.
class LazyValueInfoWrapperPass : public FunctionPass {
  LazyValueInfoWrapperPass(const LazyValueInfoWrapperPass&) = delete;
  void operator=(const LazyValueInfoWrapperPass&) = delete;

public:
  static char ID;

  LazyValueInfoWrapperPass();
  ~LazyValueInfoWrapperPass() override {
    assert(!Info.PImpl && "releaseMemory not called");
  }

  LazyValueInfo &getLVI();

  void getAnalysisUsage(AnalysisUsage &AU) const override;
  void releaseMemory() override;
  bool runOnFunction(Function &F) override;

private:
  LazyValueInfo Info;
};
} // end namespace llvm

#endif
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index 33651783cb17..2ba6036056d9 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -1,2075 +1,2084 @@
//===- LazyValueInfo.cpp - Value constraint analysis ------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interface for lazy computation of value constraint
// information.
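// A sketch of how the new forgetValue() entry point above is meant to be used
// by a client pass (hypothetical code, for exposition only): after rewriting
// an instruction in place, any cached lattice values for it are stale and
// should be dropped rather than left to be reused.
//
//   static void replaceOperandAndInvalidate(LazyValueInfo &LVI,
//                                           Instruction *Inst, Value *NewOp) {
//     Inst->setOperand(0, NewOp); // the lattice value of Inst may have changed
//     LVI.forgetValue(Inst);      // drop cached info for Inst in every block
//   }
//
// This complements eraseBlock() and threadEdge(), which invalidate by CFG
// position rather than by value.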
// //===----------------------------------------------------------------------===// #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueLattice.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/IR/CFG.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" #include using namespace llvm; using namespace PatternMatch; #define DEBUG_TYPE "lazy-value-info" // This is the number of worklist items we will process to try to discover an // answer for a given value. static const unsigned MaxProcessedPerValue = 500; char LazyValueInfoWrapperPass::ID = 0; LazyValueInfoWrapperPass::LazyValueInfoWrapperPass() : FunctionPass(ID) { initializeLazyValueInfoWrapperPassPass(*PassRegistry::getPassRegistry()); } INITIALIZE_PASS_BEGIN(LazyValueInfoWrapperPass, "lazy-value-info", "Lazy Value Information Analysis", false, true) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(LazyValueInfoWrapperPass, "lazy-value-info", "Lazy Value Information Analysis", false, true) namespace llvm { FunctionPass *createLazyValueInfoPass() { return new LazyValueInfoWrapperPass(); } } AnalysisKey LazyValueAnalysis::Key; /// Returns true if this lattice value represents at most one possible value. /// This is as precise as any lattice value can get while still representing /// reachable code. static bool hasSingleValue(const ValueLatticeElement &Val) { if (Val.isConstantRange() && Val.getConstantRange().isSingleElement()) // Integer constants are single element ranges return true; if (Val.isConstant()) // Non integer constants return true; return false; } /// Combine two sets of facts about the same value into a single set of /// facts. Note that this method is not suitable for merging facts along /// different paths in a CFG; that's what the mergeIn function is for. This /// is for merging facts gathered about the same value at the same location /// through two independent means. /// Notes: /// * This method does not promise to return the most precise possible lattice /// value implied by A and B. It is allowed to return any lattice element /// which is at least as strong as *either* A or B (unless our facts /// conflict, see below). /// * Due to unreachable code, the intersection of two lattice values could be /// contradictory. If this happens, we return some valid lattice value so as /// not confuse the rest of LVI. Ideally, we'd always return Undefined, but /// we do not make this guarantee. TODO: This would be a useful enhancement. static ValueLatticeElement intersect(const ValueLatticeElement &A, const ValueLatticeElement &B) { // Undefined is the strongest state. It means the value is known to be along // an unreachable path. 
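// Concrete illustration of the "same value, same location" combination that
// intersect() performs (the ranges are invented): suppose range metadata says
// a loaded value is in [0, 100) and a dominating assume says it is in
// [50, 200). Both facts hold at the same program point, so they may be
// intersected:
//
//   ConstantRange FromMD(APInt(32, 0), APInt(32, 100));      // [0, 100)
//   ConstantRange FromAssume(APInt(32, 50), APInt(32, 200)); // [50, 200)
//   ConstantRange Both = FromMD.intersectWith(FromAssume);   // [50, 100)
//
// Contrast with ValueLatticeElement::mergeIn(), which unions facts arriving
// along different CFG paths instead of intersecting them.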
if (A.isUnknown()) return A; if (B.isUnknown()) return B; // If we gave up for one, but got a useable fact from the other, use it. if (A.isOverdefined()) return B; if (B.isOverdefined()) return A; // Can't get any more precise than constants. if (hasSingleValue(A)) return A; if (hasSingleValue(B)) return B; // Could be either constant range or not constant here. if (!A.isConstantRange() || !B.isConstantRange()) { // TODO: Arbitrary choice, could be improved return A; } // Intersect two constant ranges ConstantRange Range = A.getConstantRange().intersectWith(B.getConstantRange()); // Note: An empty range is implicitly converted to unknown or undef depending // on MayIncludeUndef internally. return ValueLatticeElement::getRange( std::move(Range), /*MayIncludeUndef=*/A.isConstantRangeIncludingUndef() || B.isConstantRangeIncludingUndef()); } //===----------------------------------------------------------------------===// // LazyValueInfoCache Decl //===----------------------------------------------------------------------===// namespace { /// A callback value handle updates the cache when values are erased. class LazyValueInfoCache; struct LVIValueHandle final : public CallbackVH { LazyValueInfoCache *Parent; LVIValueHandle(Value *V, LazyValueInfoCache *P = nullptr) : CallbackVH(V), Parent(P) { } void deleted() override; void allUsesReplacedWith(Value *V) override { deleted(); } }; } // end anonymous namespace namespace { using NonNullPointerSet = SmallDenseSet, 2>; /// This is the cache kept by LazyValueInfo which /// maintains information about queries across the clients' queries. class LazyValueInfoCache { /// This is all of the cached information for one basic block. It contains /// the per-value lattice elements, as well as a separate set for /// overdefined values to reduce memory usage. Additionally pointers /// dereferenced in the block are cached for nullability queries. struct BlockCacheEntry { SmallDenseMap, ValueLatticeElement, 4> LatticeElements; SmallDenseSet, 4> OverDefined; // std::nullopt indicates that the nonnull pointers for this basic block // block have not been computed yet. std::optional NonNullPointers; }; /// Cached information per basic block. DenseMap, std::unique_ptr> BlockCache; /// Set of value handles used to erase values from the cache on deletion. DenseSet> ValueHandles; const BlockCacheEntry *getBlockEntry(BasicBlock *BB) const { auto It = BlockCache.find_as(BB); if (It == BlockCache.end()) return nullptr; return It->second.get(); } BlockCacheEntry *getOrCreateBlockEntry(BasicBlock *BB) { auto It = BlockCache.find_as(BB); if (It == BlockCache.end()) It = BlockCache.insert({ BB, std::make_unique() }) .first; return It->second.get(); } void addValueHandle(Value *Val) { auto HandleIt = ValueHandles.find_as(Val); if (HandleIt == ValueHandles.end()) ValueHandles.insert({ Val, this }); } public: void insertResult(Value *Val, BasicBlock *BB, const ValueLatticeElement &Result) { BlockCacheEntry *Entry = getOrCreateBlockEntry(BB); // Insert over-defined values into their own cache to reduce memory // overhead. 
if (Result.isOverdefined()) Entry->OverDefined.insert(Val); else Entry->LatticeElements.insert({ Val, Result }); addValueHandle(Val); } std::optional getCachedValueInfo(Value *V, BasicBlock *BB) const { const BlockCacheEntry *Entry = getBlockEntry(BB); if (!Entry) return std::nullopt; if (Entry->OverDefined.count(V)) return ValueLatticeElement::getOverdefined(); auto LatticeIt = Entry->LatticeElements.find_as(V); if (LatticeIt == Entry->LatticeElements.end()) return std::nullopt; return LatticeIt->second; } bool isNonNullAtEndOfBlock( Value *V, BasicBlock *BB, function_ref InitFn) { BlockCacheEntry *Entry = getOrCreateBlockEntry(BB); if (!Entry->NonNullPointers) { Entry->NonNullPointers = InitFn(BB); for (Value *V : *Entry->NonNullPointers) addValueHandle(V); } return Entry->NonNullPointers->count(V); } /// clear - Empty the cache. void clear() { BlockCache.clear(); ValueHandles.clear(); } /// Inform the cache that a given value has been deleted. void eraseValue(Value *V); /// This is part of the update interface to inform the cache /// that a block has been deleted. void eraseBlock(BasicBlock *BB); /// Updates the cache to remove any influence an overdefined value in /// OldSucc might have (unless also overdefined in NewSucc). This just /// flushes elements from the cache and does not add any. void threadEdgeImpl(BasicBlock *OldSucc,BasicBlock *NewSucc); }; } void LazyValueInfoCache::eraseValue(Value *V) { for (auto &Pair : BlockCache) { Pair.second->LatticeElements.erase(V); Pair.second->OverDefined.erase(V); if (Pair.second->NonNullPointers) Pair.second->NonNullPointers->erase(V); } auto HandleIt = ValueHandles.find_as(V); if (HandleIt != ValueHandles.end()) ValueHandles.erase(HandleIt); } void LVIValueHandle::deleted() { // This erasure deallocates *this, so it MUST happen after we're done // using any and all members of *this. Parent->eraseValue(*this); } void LazyValueInfoCache::eraseBlock(BasicBlock *BB) { BlockCache.erase(BB); } void LazyValueInfoCache::threadEdgeImpl(BasicBlock *OldSucc, BasicBlock *NewSucc) { // When an edge in the graph has been threaded, values that we could not // determine a value for before (i.e. were marked overdefined) may be // possible to solve now. We do NOT try to proactively update these values. // Instead, we clear their entries from the cache, and allow lazy updating to // recompute them when needed. // The updating process is fairly simple: we need to drop cached info // for all values that were marked overdefined in OldSucc, and for those same // values in any successor of OldSucc (except NewSucc) in which they were // also marked overdefined. std::vector worklist; worklist.push_back(OldSucc); const BlockCacheEntry *Entry = getBlockEntry(OldSucc); if (!Entry || Entry->OverDefined.empty()) return; // Nothing to process here. SmallVector ValsToClear(Entry->OverDefined.begin(), Entry->OverDefined.end()); // Use a worklist to perform a depth-first search of OldSucc's successors. // NOTE: We do not need a visited list since any blocks we have already // visited will have had their overdefined markers cleared already, and we // thus won't loop to their successors. while (!worklist.empty()) { BasicBlock *ToUpdate = worklist.back(); worklist.pop_back(); // Skip blocks only accessible through NewSucc. if (ToUpdate == NewSucc) continue; // If a value was marked overdefined in OldSucc, and is here too... 
auto OI = BlockCache.find_as(ToUpdate); if (OI == BlockCache.end() || OI->second->OverDefined.empty()) continue; auto &ValueSet = OI->second->OverDefined; bool changed = false; for (Value *V : ValsToClear) { if (!ValueSet.erase(V)) continue; // If we removed anything, then we potentially need to update // blocks successors too. changed = true; } if (!changed) continue; llvm::append_range(worklist, successors(ToUpdate)); } } namespace { /// An assembly annotator class to print LazyValueCache information in /// comments. class LazyValueInfoImpl; class LazyValueInfoAnnotatedWriter : public AssemblyAnnotationWriter { LazyValueInfoImpl *LVIImpl; // While analyzing which blocks we can solve values for, we need the dominator // information. DominatorTree &DT; public: LazyValueInfoAnnotatedWriter(LazyValueInfoImpl *L, DominatorTree &DTree) : LVIImpl(L), DT(DTree) {} void emitBasicBlockStartAnnot(const BasicBlock *BB, formatted_raw_ostream &OS) override; void emitInstructionAnnot(const Instruction *I, formatted_raw_ostream &OS) override; }; } namespace { // The actual implementation of the lazy analysis and update. Note that the // inheritance from LazyValueInfoCache is intended to be temporary while // splitting the code and then transitioning to a has-a relationship. class LazyValueInfoImpl { /// Cached results from previous queries LazyValueInfoCache TheCache; /// This stack holds the state of the value solver during a query. /// It basically emulates the callstack of the naive /// recursive value lookup process. SmallVector, 8> BlockValueStack; /// Keeps track of which block-value pairs are in BlockValueStack. DenseSet > BlockValueSet; /// Push BV onto BlockValueStack unless it's already in there. /// Returns true on success. bool pushBlockValue(const std::pair &BV) { if (!BlockValueSet.insert(BV).second) return false; // It's already in the stack. LLVM_DEBUG(dbgs() << "PUSH: " << *BV.second << " in " << BV.first->getName() << "\n"); BlockValueStack.push_back(BV); return true; } AssumptionCache *AC; ///< A pointer to the cache of @llvm.assume calls. const DataLayout &DL; ///< A mandatory DataLayout /// Declaration of the llvm.experimental.guard() intrinsic, /// if it exists in the module. Function *GuardDecl; std::optional getBlockValue(Value *Val, BasicBlock *BB, Instruction *CxtI); std::optional getEdgeValue(Value *V, BasicBlock *F, BasicBlock *T, Instruction *CxtI = nullptr); // These methods process one work item and may add more. A false value // returned means that the work item was not completely processed and must // be revisited after going through the new items. 
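// The contract between solve() and the solveBlockValue* helpers below,
// sketched in simplified pseudo-C++ (not the actual control flow):
//
//   while (!BlockValueStack.empty()) {
//     auto [BB, Val] = BlockValueStack.back();
//     if (solveBlockValue(Val, BB)) {
//       BlockValueStack.pop_back();   // result is now in TheCache
//     } else {
//       // solveBlockValue pushed the operand/predecessor queries it was
//       // missing; leave (BB, Val) on the stack and revisit it afterwards.
//     }
//   }
//
// This turns the naturally recursive lattice evaluation into an explicit
// worklist, throttled by MaxProcessedPerValue.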
bool solveBlockValue(Value *Val, BasicBlock *BB); std::optional solveBlockValueImpl(Value *Val, BasicBlock *BB); std::optional solveBlockValueNonLocal(Value *Val, BasicBlock *BB); std::optional solveBlockValuePHINode(PHINode *PN, BasicBlock *BB); std::optional solveBlockValueSelect(SelectInst *S, BasicBlock *BB); std::optional getRangeFor(Value *V, Instruction *CxtI, BasicBlock *BB); std::optional solveBlockValueBinaryOpImpl( Instruction *I, BasicBlock *BB, std::function OpFn); std::optional solveBlockValueBinaryOp(BinaryOperator *BBI, BasicBlock *BB); std::optional solveBlockValueCast(CastInst *CI, BasicBlock *BB); std::optional solveBlockValueOverflowIntrinsic(WithOverflowInst *WO, BasicBlock *BB); std::optional solveBlockValueIntrinsic(IntrinsicInst *II, BasicBlock *BB); std::optional solveBlockValueExtractValue(ExtractValueInst *EVI, BasicBlock *BB); bool isNonNullAtEndOfBlock(Value *Val, BasicBlock *BB); void intersectAssumeOrGuardBlockValueConstantRange(Value *Val, ValueLatticeElement &BBLV, Instruction *BBI); void solve(); public: /// This is the query interface to determine the lattice value for the /// specified Value* at the context instruction (if specified) or at the /// start of the block. ValueLatticeElement getValueInBlock(Value *V, BasicBlock *BB, Instruction *CxtI = nullptr); /// This is the query interface to determine the lattice value for the /// specified Value* at the specified instruction using only information /// from assumes/guards and range metadata. Unlike getValueInBlock(), no /// recursive query is performed. ValueLatticeElement getValueAt(Value *V, Instruction *CxtI); /// This is the query interface to determine the lattice /// value for the specified Value* that is true on the specified edge. ValueLatticeElement getValueOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB, Instruction *CxtI = nullptr); /// Complete flush all previously computed values void clear() { TheCache.clear(); } /// Printing the LazyValueInfo Analysis. void printLVI(Function &F, DominatorTree &DTree, raw_ostream &OS) { LazyValueInfoAnnotatedWriter Writer(this, DTree); F.print(OS, &Writer); } + /// This is part of the update interface to remove information related to this + /// value from the cache. + void forgetValue(Value *V) { TheCache.eraseValue(V); } + /// This is part of the update interface to inform the cache /// that a block has been deleted. void eraseBlock(BasicBlock *BB) { TheCache.eraseBlock(BB); } /// This is the update interface to inform the cache that an edge from /// PredBB to OldSucc has been threaded to be from PredBB to NewSucc. void threadEdge(BasicBlock *PredBB,BasicBlock *OldSucc,BasicBlock *NewSucc); LazyValueInfoImpl(AssumptionCache *AC, const DataLayout &DL, Function *GuardDecl) : AC(AC), DL(DL), GuardDecl(GuardDecl) {} }; } // end anonymous namespace void LazyValueInfoImpl::solve() { SmallVector, 8> StartingStack( BlockValueStack.begin(), BlockValueStack.end()); unsigned processedCount = 0; while (!BlockValueStack.empty()) { processedCount++; // Abort if we have to process too many values to get a result for this one. // Because of the design of the overdefined cache currently being per-block // to avoid naming-related issues (IE it wants to try to give different // results for the same name in different blocks), overdefined results don't // get cached globally, which in turn means we will often try to rediscover // the same overdefined result again and again. 
Once something like // PredicateInfo is used in LVI or CVP, we should be able to make the // overdefined cache global, and remove this throttle. if (processedCount > MaxProcessedPerValue) { LLVM_DEBUG( dbgs() << "Giving up on stack because we are getting too deep\n"); // Fill in the original values while (!StartingStack.empty()) { std::pair &e = StartingStack.back(); TheCache.insertResult(e.second, e.first, ValueLatticeElement::getOverdefined()); StartingStack.pop_back(); } BlockValueSet.clear(); BlockValueStack.clear(); return; } std::pair e = BlockValueStack.back(); assert(BlockValueSet.count(e) && "Stack value should be in BlockValueSet!"); if (solveBlockValue(e.second, e.first)) { // The work item was completely processed. assert(BlockValueStack.back() == e && "Nothing should have been pushed!"); #ifndef NDEBUG std::optional BBLV = TheCache.getCachedValueInfo(e.second, e.first); assert(BBLV && "Result should be in cache!"); LLVM_DEBUG( dbgs() << "POP " << *e.second << " in " << e.first->getName() << " = " << *BBLV << "\n"); #endif BlockValueStack.pop_back(); BlockValueSet.erase(e); } else { // More work needs to be done before revisiting. assert(BlockValueStack.back() != e && "Stack should have been pushed!"); } } } std::optional LazyValueInfoImpl::getBlockValue(Value *Val, BasicBlock *BB, Instruction *CxtI) { // If already a constant, there is nothing to compute. if (Constant *VC = dyn_cast(Val)) return ValueLatticeElement::get(VC); if (std::optional OptLatticeVal = TheCache.getCachedValueInfo(Val, BB)) { intersectAssumeOrGuardBlockValueConstantRange(Val, *OptLatticeVal, CxtI); return OptLatticeVal; } // We have hit a cycle, assume overdefined. if (!pushBlockValue({ BB, Val })) return ValueLatticeElement::getOverdefined(); // Yet to be resolved. return std::nullopt; } static ValueLatticeElement getFromRangeMetadata(Instruction *BBI) { switch (BBI->getOpcode()) { default: break; case Instruction::Load: case Instruction::Call: case Instruction::Invoke: if (MDNode *Ranges = BBI->getMetadata(LLVMContext::MD_range)) if (isa(BBI->getType())) { return ValueLatticeElement::getRange( getConstantRangeFromMetadata(*Ranges)); } break; }; // Nothing known - will be intersected with other facts return ValueLatticeElement::getOverdefined(); } bool LazyValueInfoImpl::solveBlockValue(Value *Val, BasicBlock *BB) { assert(!isa(Val) && "Value should not be constant"); assert(!TheCache.getCachedValueInfo(Val, BB) && "Value should not be in cache"); // Hold off inserting this value into the Cache in case we have to return // false and come back later. std::optional Res = solveBlockValueImpl(Val, BB); if (!Res) // Work pushed, will revisit return false; TheCache.insertResult(Val, BB, *Res); return true; } std::optional LazyValueInfoImpl::solveBlockValueImpl(Value *Val, BasicBlock *BB) { Instruction *BBI = dyn_cast(Val); if (!BBI || BBI->getParent() != BB) return solveBlockValueNonLocal(Val, BB); if (PHINode *PN = dyn_cast(BBI)) return solveBlockValuePHINode(PN, BB); if (auto *SI = dyn_cast(BBI)) return solveBlockValueSelect(SI, BB); // If this value is a nonnull pointer, record it's range and bailout. Note // that for all other pointer typed values, we terminate the search at the // definition. We could easily extend this to look through geps, bitcasts, // and the like to prove non-nullness, but it's not clear that's worth it // compile time wise. The context-insensitive value walk done inside // isKnownNonZero gets most of the profitable cases at much less expense. 
// This does mean that we have a sensitivity to where the defining // instruction is placed, even if it could legally be hoisted much higher. // That is unfortunate. PointerType *PT = dyn_cast(BBI->getType()); if (PT && isKnownNonZero(BBI, DL)) return ValueLatticeElement::getNot(ConstantPointerNull::get(PT)); if (BBI->getType()->isIntegerTy()) { if (auto *CI = dyn_cast(BBI)) return solveBlockValueCast(CI, BB); if (BinaryOperator *BO = dyn_cast(BBI)) return solveBlockValueBinaryOp(BO, BB); if (auto *EVI = dyn_cast(BBI)) return solveBlockValueExtractValue(EVI, BB); if (auto *II = dyn_cast(BBI)) return solveBlockValueIntrinsic(II, BB); } LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName() << "' - unknown inst def found.\n"); return getFromRangeMetadata(BBI); } static void AddNonNullPointer(Value *Ptr, NonNullPointerSet &PtrSet) { // TODO: Use NullPointerIsDefined instead. if (Ptr->getType()->getPointerAddressSpace() == 0) PtrSet.insert(getUnderlyingObject(Ptr)); } static void AddNonNullPointersByInstruction( Instruction *I, NonNullPointerSet &PtrSet) { if (LoadInst *L = dyn_cast(I)) { AddNonNullPointer(L->getPointerOperand(), PtrSet); } else if (StoreInst *S = dyn_cast(I)) { AddNonNullPointer(S->getPointerOperand(), PtrSet); } else if (MemIntrinsic *MI = dyn_cast(I)) { if (MI->isVolatile()) return; // FIXME: check whether it has a valuerange that excludes zero? ConstantInt *Len = dyn_cast(MI->getLength()); if (!Len || Len->isZero()) return; AddNonNullPointer(MI->getRawDest(), PtrSet); if (MemTransferInst *MTI = dyn_cast(MI)) AddNonNullPointer(MTI->getRawSource(), PtrSet); } } bool LazyValueInfoImpl::isNonNullAtEndOfBlock(Value *Val, BasicBlock *BB) { if (NullPointerIsDefined(BB->getParent(), Val->getType()->getPointerAddressSpace())) return false; Val = Val->stripInBoundsOffsets(); return TheCache.isNonNullAtEndOfBlock(Val, BB, [](BasicBlock *BB) { NonNullPointerSet NonNullPointers; for (Instruction &I : *BB) AddNonNullPointersByInstruction(&I, NonNullPointers); return NonNullPointers; }); } std::optional LazyValueInfoImpl::solveBlockValueNonLocal(Value *Val, BasicBlock *BB) { ValueLatticeElement Result; // Start Undefined. // If this is the entry block, we must be asking about an argument. The // value is overdefined. if (BB->isEntryBlock()) { assert(isa(Val) && "Unknown live-in to the entry block"); return ValueLatticeElement::getOverdefined(); } // Loop over all of our predecessors, merging what we know from them into // result. If we encounter an unexplored predecessor, we eagerly explore it // in a depth first manner. In practice, this has the effect of discovering // paths we can't analyze eagerly without spending compile times analyzing // other paths. This heuristic benefits from the fact that predecessors are // frequently arranged such that dominating ones come first and we quickly // find a path to function entry. TODO: We should consider explicitly // canonicalizing to make this true rather than relying on this happy // accident. for (BasicBlock *Pred : predecessors(BB)) { std::optional EdgeResult = getEdgeValue(Val, Pred, BB); if (!EdgeResult) // Explore that input, then return here return std::nullopt; Result.mergeIn(*EdgeResult); // If we hit overdefined, exit early. The BlockVals entry is already set // to overdefined. if (Result.isOverdefined()) { LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName() << "' - overdefined because of pred '" << Pred->getName() << "' (non local).\n"); return Result; } } // Return the merged value, which is more precise than 'overdefined'. 
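// Hypothetical IR illustrating the non-null bookkeeping above
// (AddNonNullPointersByInstruction / isNonNullAtEndOfBlock):
//
//   bb:
//     store i32 0, ptr %p        ; dereferences %p
//     %v = load i32, ptr %q      ; dereferences %q
//     br label %next
//
// In address space 0, where null is not a defined pointer, %p and %q must be
// non-null by the end of %bb, so a query at the terminator can sharpen their
// lattice values from "overdefined" to "not null".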
assert(!Result.isOverdefined()); return Result; } std::optional LazyValueInfoImpl::solveBlockValuePHINode(PHINode *PN, BasicBlock *BB) { ValueLatticeElement Result; // Start Undefined. // Loop over all of our predecessors, merging what we know from them into // result. See the comment about the chosen traversal order in // solveBlockValueNonLocal; the same reasoning applies here. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *PhiBB = PN->getIncomingBlock(i); Value *PhiVal = PN->getIncomingValue(i); // Note that we can provide PN as the context value to getEdgeValue, even // though the results will be cached, because PN is the value being used as // the cache key in the caller. std::optional EdgeResult = getEdgeValue(PhiVal, PhiBB, BB, PN); if (!EdgeResult) // Explore that input, then return here return std::nullopt; Result.mergeIn(*EdgeResult); // If we hit overdefined, exit early. The BlockVals entry is already set // to overdefined. if (Result.isOverdefined()) { LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName() << "' - overdefined because of pred (local).\n"); return Result; } } // Return the merged value, which is more precise than 'overdefined'. assert(!Result.isOverdefined() && "Possible PHI in entry block?"); return Result; } static ValueLatticeElement getValueFromCondition(Value *Val, Value *Cond, bool isTrueDest = true); // If we can determine a constraint on the value given conditions assumed by // the program, intersect those constraints with BBLV void LazyValueInfoImpl::intersectAssumeOrGuardBlockValueConstantRange( Value *Val, ValueLatticeElement &BBLV, Instruction *BBI) { BBI = BBI ? BBI : dyn_cast(Val); if (!BBI) return; BasicBlock *BB = BBI->getParent(); for (auto &AssumeVH : AC->assumptionsFor(Val)) { if (!AssumeVH) continue; // Only check assumes in the block of the context instruction. Other // assumes will have already been taken into account when the value was // propagated from predecessor blocks. auto *I = cast(AssumeVH); if (I->getParent() != BB || !isValidAssumeForContext(I, BBI)) continue; BBLV = intersect(BBLV, getValueFromCondition(Val, I->getArgOperand(0))); } // If guards are not used in the module, don't spend time looking for them if (GuardDecl && !GuardDecl->use_empty() && BBI->getIterator() != BB->begin()) { for (Instruction &I : make_range(std::next(BBI->getIterator().getReverse()), BB->rend())) { Value *Cond = nullptr; if (match(&I, m_Intrinsic(m_Value(Cond)))) BBLV = intersect(BBLV, getValueFromCondition(Val, Cond)); } } if (BBLV.isOverdefined()) { // Check whether we're checking at the terminator, and the pointer has // been dereferenced in this block. 
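// Hypothetical IR for the assume handling in
// intersectAssumeOrGuardBlockValueConstantRange above:
//
//   %cmp = icmp ult i32 %x, 10
//   call void @llvm.assume(i1 %cmp)
//   ; at a later context instruction in the same block for which the assume
//   ; is valid, the lattice value of %x is intersected with [0, 10)
//
// Only assumes in the context instruction's own block are inspected here;
// assumes in predecessor blocks were already folded in when the block value
// was propagated across the incoming edges.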
PointerType *PTy = dyn_cast(Val->getType()); if (PTy && BB->getTerminator() == BBI && isNonNullAtEndOfBlock(Val, BB)) BBLV = ValueLatticeElement::getNot(ConstantPointerNull::get(PTy)); } } static ConstantRange getConstantRangeOrFull(const ValueLatticeElement &Val, Type *Ty, const DataLayout &DL) { if (Val.isConstantRange()) return Val.getConstantRange(); return ConstantRange::getFull(DL.getTypeSizeInBits(Ty)); } std::optional LazyValueInfoImpl::solveBlockValueSelect(SelectInst *SI, BasicBlock *BB) { // Recurse on our inputs if needed std::optional OptTrueVal = getBlockValue(SI->getTrueValue(), BB, SI); if (!OptTrueVal) return std::nullopt; ValueLatticeElement &TrueVal = *OptTrueVal; std::optional OptFalseVal = getBlockValue(SI->getFalseValue(), BB, SI); if (!OptFalseVal) return std::nullopt; ValueLatticeElement &FalseVal = *OptFalseVal; if (TrueVal.isConstantRange() || FalseVal.isConstantRange()) { const ConstantRange &TrueCR = getConstantRangeOrFull(TrueVal, SI->getType(), DL); const ConstantRange &FalseCR = getConstantRangeOrFull(FalseVal, SI->getType(), DL); Value *LHS = nullptr; Value *RHS = nullptr; SelectPatternResult SPR = matchSelectPattern(SI, LHS, RHS); // Is this a min specifically of our two inputs? (Avoid the risk of // ValueTracking getting smarter looking back past our immediate inputs.) if (SelectPatternResult::isMinOrMax(SPR.Flavor) && ((LHS == SI->getTrueValue() && RHS == SI->getFalseValue()) || (RHS == SI->getTrueValue() && LHS == SI->getFalseValue()))) { ConstantRange ResultCR = [&]() { switch (SPR.Flavor) { default: llvm_unreachable("unexpected minmax type!"); case SPF_SMIN: /// Signed minimum return TrueCR.smin(FalseCR); case SPF_UMIN: /// Unsigned minimum return TrueCR.umin(FalseCR); case SPF_SMAX: /// Signed maximum return TrueCR.smax(FalseCR); case SPF_UMAX: /// Unsigned maximum return TrueCR.umax(FalseCR); }; }(); return ValueLatticeElement::getRange( ResultCR, TrueVal.isConstantRangeIncludingUndef() || FalseVal.isConstantRangeIncludingUndef()); } if (SPR.Flavor == SPF_ABS) { if (LHS == SI->getTrueValue()) return ValueLatticeElement::getRange( TrueCR.abs(), TrueVal.isConstantRangeIncludingUndef()); if (LHS == SI->getFalseValue()) return ValueLatticeElement::getRange( FalseCR.abs(), FalseVal.isConstantRangeIncludingUndef()); } if (SPR.Flavor == SPF_NABS) { ConstantRange Zero(APInt::getZero(TrueCR.getBitWidth())); if (LHS == SI->getTrueValue()) return ValueLatticeElement::getRange( Zero.sub(TrueCR.abs()), FalseVal.isConstantRangeIncludingUndef()); if (LHS == SI->getFalseValue()) return ValueLatticeElement::getRange( Zero.sub(FalseCR.abs()), FalseVal.isConstantRangeIncludingUndef()); } } // Can we constrain the facts about the true and false values by using the // condition itself? This shows up with idioms like e.g. select(a > 5, a, 5). // TODO: We could potentially refine an overdefined true value above. Value *Cond = SI->getCondition(); // If the value is undef, a different value may be chosen in // the select condition. 
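// Worked example for the min/max handling in solveBlockValueSelect above
// (ranges invented for illustration). For
//
//   %m = select i1 %cmp, i32 %a, i32 %b   ; matchSelectPattern reports SPF_SMIN
//
// with %a known to be in [0, 10) and %b in [5, 20), the result range is
// TrueCR.smin(FalseCR) = [0, 10): the smaller operand can be as small as 0 but
// can never reach 10.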
if (isGuaranteedNotToBeUndefOrPoison(Cond, AC)) { TrueVal = intersect(TrueVal, getValueFromCondition(SI->getTrueValue(), Cond, true)); FalseVal = intersect( FalseVal, getValueFromCondition(SI->getFalseValue(), Cond, false)); } ValueLatticeElement Result = TrueVal; Result.mergeIn(FalseVal); return Result; } std::optional LazyValueInfoImpl::getRangeFor(Value *V, Instruction *CxtI, BasicBlock *BB) { std::optional OptVal = getBlockValue(V, BB, CxtI); if (!OptVal) return std::nullopt; return getConstantRangeOrFull(*OptVal, V->getType(), DL); } std::optional LazyValueInfoImpl::solveBlockValueCast(CastInst *CI, BasicBlock *BB) { // Without knowing how wide the input is, we can't analyze it in any useful // way. if (!CI->getOperand(0)->getType()->isSized()) return ValueLatticeElement::getOverdefined(); // Filter out casts we don't know how to reason about before attempting to // recurse on our operand. This can cut a long search short if we know we're // not going to be able to get any useful information anways. switch (CI->getOpcode()) { case Instruction::Trunc: case Instruction::SExt: case Instruction::ZExt: case Instruction::BitCast: break; default: // Unhandled instructions are overdefined. LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName() << "' - overdefined (unknown cast).\n"); return ValueLatticeElement::getOverdefined(); } // Figure out the range of the LHS. If that fails, we still apply the // transfer rule on the full set since we may be able to locally infer // interesting facts. std::optional LHSRes = getRangeFor(CI->getOperand(0), CI, BB); if (!LHSRes) // More work to do before applying this transfer rule. return std::nullopt; const ConstantRange &LHSRange = *LHSRes; const unsigned ResultBitWidth = CI->getType()->getIntegerBitWidth(); // NOTE: We're currently limited by the set of operations that ConstantRange // can evaluate symbolically. Enhancing that set will allows us to analyze // more definitions. return ValueLatticeElement::getRange(LHSRange.castOp(CI->getOpcode(), ResultBitWidth)); } std::optional LazyValueInfoImpl::solveBlockValueBinaryOpImpl( Instruction *I, BasicBlock *BB, std::function OpFn) { // Figure out the ranges of the operands. If that fails, use a // conservative range, but apply the transfer rule anyways. This // lets us pick up facts from expressions like "and i32 (call i32 // @foo()), 32" std::optional LHSRes = getRangeFor(I->getOperand(0), I, BB); std::optional RHSRes = getRangeFor(I->getOperand(1), I, BB); if (!LHSRes || !RHSRes) // More work to do before applying this transfer rule. 
return std::nullopt; const ConstantRange &LHSRange = *LHSRes; const ConstantRange &RHSRange = *RHSRes; return ValueLatticeElement::getRange(OpFn(LHSRange, RHSRange)); } std::optional LazyValueInfoImpl::solveBlockValueBinaryOp(BinaryOperator *BO, BasicBlock *BB) { assert(BO->getOperand(0)->getType()->isSized() && "all operands to binary operators are sized"); if (auto *OBO = dyn_cast(BO)) { unsigned NoWrapKind = 0; if (OBO->hasNoUnsignedWrap()) NoWrapKind |= OverflowingBinaryOperator::NoUnsignedWrap; if (OBO->hasNoSignedWrap()) NoWrapKind |= OverflowingBinaryOperator::NoSignedWrap; return solveBlockValueBinaryOpImpl( BO, BB, [BO, NoWrapKind](const ConstantRange &CR1, const ConstantRange &CR2) { return CR1.overflowingBinaryOp(BO->getOpcode(), CR2, NoWrapKind); }); } return solveBlockValueBinaryOpImpl( BO, BB, [BO](const ConstantRange &CR1, const ConstantRange &CR2) { return CR1.binaryOp(BO->getOpcode(), CR2); }); } std::optional LazyValueInfoImpl::solveBlockValueOverflowIntrinsic(WithOverflowInst *WO, BasicBlock *BB) { return solveBlockValueBinaryOpImpl( WO, BB, [WO](const ConstantRange &CR1, const ConstantRange &CR2) { return CR1.binaryOp(WO->getBinaryOp(), CR2); }); } std::optional LazyValueInfoImpl::solveBlockValueIntrinsic(IntrinsicInst *II, BasicBlock *BB) { ValueLatticeElement MetadataVal = getFromRangeMetadata(II); if (!ConstantRange::isIntrinsicSupported(II->getIntrinsicID())) { LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName() << "' - unknown intrinsic.\n"); return MetadataVal; } SmallVector OpRanges; for (Value *Op : II->args()) { std::optional Range = getRangeFor(Op, II, BB); if (!Range) return std::nullopt; OpRanges.push_back(*Range); } return intersect(ValueLatticeElement::getRange(ConstantRange::intrinsic( II->getIntrinsicID(), OpRanges)), MetadataVal); } std::optional LazyValueInfoImpl::solveBlockValueExtractValue(ExtractValueInst *EVI, BasicBlock *BB) { if (auto *WO = dyn_cast(EVI->getAggregateOperand())) if (EVI->getNumIndices() == 1 && *EVI->idx_begin() == 0) return solveBlockValueOverflowIntrinsic(WO, BB); // Handle extractvalue of insertvalue to allow further simplification // based on replaced with.overflow intrinsics. if (Value *V = simplifyExtractValueInst( EVI->getAggregateOperand(), EVI->getIndices(), EVI->getModule()->getDataLayout())) return getBlockValue(V, BB, EVI); LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName() << "' - overdefined (unknown extractvalue).\n"); return ValueLatticeElement::getOverdefined(); } static bool matchICmpOperand(APInt &Offset, Value *LHS, Value *Val, ICmpInst::Predicate Pred) { if (LHS == Val) return true; // Handle range checking idiom produced by InstCombine. We will subtract the // offset from the allowed range for RHS in this case. const APInt *C; if (match(LHS, m_Add(m_Specific(Val), m_APInt(C)))) { Offset = *C; return true; } // Handle the symmetric case. This appears in saturation patterns like // (x == 16) ? 16 : (x + 1). if (match(Val, m_Add(m_Specific(LHS), m_APInt(C)))) { Offset = -*C; return true; } // If (x | y) < C, then (x < C) && (y < C). if (match(LHS, m_c_Or(m_Specific(Val), m_Value())) && (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE)) return true; // If (x & y) > C, then (x > C) && (y > C). if (match(LHS, m_c_And(m_Specific(Val), m_Value())) && (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE)) return true; return false; } /// Get value range for a "(Val + Offset) Pred RHS" condition. 
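// Worked example for matchICmpOperand and getValueFromSimpleICmpCondition
// (hypothetical IR). InstCombine canonicalizes two-sided range checks into the
// "(Val + Offset) u< N" form:
//
//   %adj = add i32 %x, -5
//   %cmp = icmp ult i32 %adj, 10
//   br i1 %cmp, label %in_range, label %out
//
// On the %in_range edge, makeAllowedICmpRegion(ult, 10) gives [0, 10) for
// %adj, and subtracting the matched offset of -5 yields %x in [5, 15).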
static ValueLatticeElement getValueFromSimpleICmpCondition( CmpInst::Predicate Pred, Value *RHS, const APInt &Offset) { ConstantRange RHSRange(RHS->getType()->getIntegerBitWidth(), /*isFullSet=*/true); if (ConstantInt *CI = dyn_cast(RHS)) RHSRange = ConstantRange(CI->getValue()); else if (Instruction *I = dyn_cast(RHS)) if (auto *Ranges = I->getMetadata(LLVMContext::MD_range)) RHSRange = getConstantRangeFromMetadata(*Ranges); ConstantRange TrueValues = ConstantRange::makeAllowedICmpRegion(Pred, RHSRange); return ValueLatticeElement::getRange(TrueValues.subtract(Offset)); } static ValueLatticeElement getValueFromICmpCondition(Value *Val, ICmpInst *ICI, bool isTrueDest) { Value *LHS = ICI->getOperand(0); Value *RHS = ICI->getOperand(1); // Get the predicate that must hold along the considered edge. CmpInst::Predicate EdgePred = isTrueDest ? ICI->getPredicate() : ICI->getInversePredicate(); if (isa(RHS)) { if (ICI->isEquality() && LHS == Val) { if (EdgePred == ICmpInst::ICMP_EQ) return ValueLatticeElement::get(cast(RHS)); else if (!isa(RHS)) return ValueLatticeElement::getNot(cast(RHS)); } } Type *Ty = Val->getType(); if (!Ty->isIntegerTy()) return ValueLatticeElement::getOverdefined(); unsigned BitWidth = Ty->getScalarSizeInBits(); APInt Offset(BitWidth, 0); if (matchICmpOperand(Offset, LHS, Val, EdgePred)) return getValueFromSimpleICmpCondition(EdgePred, RHS, Offset); CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(EdgePred); if (matchICmpOperand(Offset, RHS, Val, SwappedPred)) return getValueFromSimpleICmpCondition(SwappedPred, LHS, Offset); const APInt *Mask, *C; if (match(LHS, m_And(m_Specific(Val), m_APInt(Mask))) && match(RHS, m_APInt(C))) { // If (Val & Mask) == C then all the masked bits are known and we can // compute a value range based on that. if (EdgePred == ICmpInst::ICMP_EQ) { KnownBits Known; Known.Zero = ~*C & *Mask; Known.One = *C & *Mask; return ValueLatticeElement::getRange( ConstantRange::fromKnownBits(Known, /*IsSigned*/ false)); } // If (Val & Mask) != 0 then the value must be larger than the lowest set // bit of Mask. if (EdgePred == ICmpInst::ICMP_NE && !Mask->isZero() && C->isZero()) { return ValueLatticeElement::getRange(ConstantRange::getNonEmpty( APInt::getOneBitSet(BitWidth, Mask->countr_zero()), APInt::getZero(BitWidth))); } } // If (X urem Modulus) >= C, then X >= C. // If trunc X >= C, then X >= C. // TODO: An upper bound could be computed as well. if (match(LHS, m_CombineOr(m_URem(m_Specific(Val), m_Value()), m_Trunc(m_Specific(Val)))) && match(RHS, m_APInt(C))) { // Use the icmp region so we don't have to deal with different predicates. ConstantRange CR = ConstantRange::makeExactICmpRegion(EdgePred, *C); if (!CR.isEmptySet()) return ValueLatticeElement::getRange(ConstantRange::getNonEmpty( CR.getUnsignedMin().zext(BitWidth), APInt(BitWidth, 0))); } return ValueLatticeElement::getOverdefined(); } // Handle conditions of the form // extractvalue(op.with.overflow(%x, C), 1). static ValueLatticeElement getValueFromOverflowCondition( Value *Val, WithOverflowInst *WO, bool IsTrueDest) { // TODO: This only works with a constant RHS for now. We could also compute // the range of the RHS, but this doesn't fit into the current structure of // the edge value calculation. const APInt *C; if (WO->getLHS() != Val || !match(WO->getRHS(), m_APInt(C))) return ValueLatticeElement::getOverdefined(); // Calculate the possible values of %x for which no overflow occurs. 
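// Worked example for the with.overflow condition handling (hypothetical IR):
//
//   %wo  = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 %x, i8 100)
//   %ovf = extractvalue { i8, i1 } %wo, 1
//   br i1 %ovf, label %overflowed, label %ok
//
// makeExactNoWrapRegion(Add, 100, NoUnsignedWrap) is [0, 156) for i8, so %x is
// constrained to [0, 156) on the %ok edge and to the inverse wrapped range
// [156, 0), i.e. 156..255, on the %overflowed edge.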
ConstantRange NWR = ConstantRange::makeExactNoWrapRegion( WO->getBinaryOp(), *C, WO->getNoWrapKind()); // If overflow is false, %x is constrained to NWR. If overflow is true, %x is // constrained to it's inverse (all values that might cause overflow). if (IsTrueDest) NWR = NWR.inverse(); return ValueLatticeElement::getRange(NWR); } // Tracks a Value * condition and whether we're interested in it or its inverse typedef PointerIntPair CondValue; static std::optional getValueFromConditionImpl( Value *Val, CondValue CondVal, bool isRevisit, SmallDenseMap &Visited, SmallVectorImpl &Worklist) { Value *Cond = CondVal.getPointer(); bool isTrueDest = CondVal.getInt(); if (!isRevisit) { if (ICmpInst *ICI = dyn_cast(Cond)) return getValueFromICmpCondition(Val, ICI, isTrueDest); if (auto *EVI = dyn_cast(Cond)) if (auto *WO = dyn_cast(EVI->getAggregateOperand())) if (EVI->getNumIndices() == 1 && *EVI->idx_begin() == 1) return getValueFromOverflowCondition(Val, WO, isTrueDest); } Value *N; if (match(Cond, m_Not(m_Value(N)))) { CondValue NKey(N, !isTrueDest); auto NV = Visited.find(NKey); if (NV == Visited.end()) { Worklist.push_back(NKey); return std::nullopt; } return NV->second; } Value *L, *R; bool IsAnd; if (match(Cond, m_LogicalAnd(m_Value(L), m_Value(R)))) IsAnd = true; else if (match(Cond, m_LogicalOr(m_Value(L), m_Value(R)))) IsAnd = false; else return ValueLatticeElement::getOverdefined(); auto LV = Visited.find(CondValue(L, isTrueDest)); auto RV = Visited.find(CondValue(R, isTrueDest)); // if (L && R) -> intersect L and R // if (!(L || R)) -> intersect !L and !R // if (L || R) -> union L and R // if (!(L && R)) -> union !L and !R if ((isTrueDest ^ IsAnd) && (LV != Visited.end())) { ValueLatticeElement V = LV->second; if (V.isOverdefined()) return V; if (RV != Visited.end()) { V.mergeIn(RV->second); return V; } } if (LV == Visited.end() || RV == Visited.end()) { assert(!isRevisit); if (LV == Visited.end()) Worklist.push_back(CondValue(L, isTrueDest)); if (RV == Visited.end()) Worklist.push_back(CondValue(R, isTrueDest)); return std::nullopt; } return intersect(LV->second, RV->second); } ValueLatticeElement getValueFromCondition(Value *Val, Value *Cond, bool isTrueDest) { assert(Cond && "precondition"); SmallDenseMap Visited; SmallVector Worklist; CondValue CondKey(Cond, isTrueDest); Worklist.push_back(CondKey); do { CondValue CurrentCond = Worklist.back(); // Insert an Overdefined placeholder into the set to prevent // infinite recursion if there exists IRs that use not // dominated by its def as in this example: // "%tmp3 = or i1 undef, %tmp4" // "%tmp4 = or i1 undef, %tmp3" auto Iter = Visited.try_emplace(CurrentCond, ValueLatticeElement::getOverdefined()); bool isRevisit = !Iter.second; std::optional Result = getValueFromConditionImpl( Val, CurrentCond, isRevisit, Visited, Worklist); if (Result) { Visited[CurrentCond] = *Result; Worklist.pop_back(); } } while (!Worklist.empty()); auto Result = Visited.find(CondKey); assert(Result != Visited.end()); return Result->second; } // Return true if Usr has Op as an operand, otherwise false. static bool usesOperand(User *Usr, Value *Op) { return is_contained(Usr->operands(), Op); } // Return true if the instruction type of Val is supported by // constantFoldUser(). Currently CastInst, BinaryOperator and FreezeInst only. // Call this before calling constantFoldUser() to find out if it's even worth // attempting to call it. 
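// Worked example for the logical and/or handling in getValueFromConditionImpl
// above (hypothetical IR):
//
//   %a = icmp sgt i32 %x, 0
//   %b = icmp slt i32 %x, 10
//   %c = and i1 %a, %b
//   br i1 %c, label %then, label %else
//
// On the %then edge both operands hold, so their ranges are intersected and
// %x is known to be in [1, 10). On the %else edge only "!%a or !%b" is known,
// so the negated ranges are unioned, leaving the wrapped range [10, 1),
// i.e. everything except 1..9.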
static bool isOperationFoldable(User *Usr) { return isa(Usr) || isa(Usr) || isa(Usr); } // Check if Usr can be simplified to an integer constant when the value of one // of its operands Op is an integer constant OpConstVal. If so, return it as an // lattice value range with a single element or otherwise return an overdefined // lattice value. static ValueLatticeElement constantFoldUser(User *Usr, Value *Op, const APInt &OpConstVal, const DataLayout &DL) { assert(isOperationFoldable(Usr) && "Precondition"); Constant* OpConst = Constant::getIntegerValue(Op->getType(), OpConstVal); // Check if Usr can be simplified to a constant. if (auto *CI = dyn_cast(Usr)) { assert(CI->getOperand(0) == Op && "Operand 0 isn't Op"); if (auto *C = dyn_cast_or_null( simplifyCastInst(CI->getOpcode(), OpConst, CI->getDestTy(), DL))) { return ValueLatticeElement::getRange(ConstantRange(C->getValue())); } } else if (auto *BO = dyn_cast(Usr)) { bool Op0Match = BO->getOperand(0) == Op; bool Op1Match = BO->getOperand(1) == Op; assert((Op0Match || Op1Match) && "Operand 0 nor Operand 1 isn't a match"); Value *LHS = Op0Match ? OpConst : BO->getOperand(0); Value *RHS = Op1Match ? OpConst : BO->getOperand(1); if (auto *C = dyn_cast_or_null( simplifyBinOp(BO->getOpcode(), LHS, RHS, DL))) { return ValueLatticeElement::getRange(ConstantRange(C->getValue())); } } else if (isa(Usr)) { assert(cast(Usr)->getOperand(0) == Op && "Operand 0 isn't Op"); return ValueLatticeElement::getRange(ConstantRange(OpConstVal)); } return ValueLatticeElement::getOverdefined(); } /// Compute the value of Val on the edge BBFrom -> BBTo. Returns false if /// Val is not constrained on the edge. Result is unspecified if return value /// is false. static std::optional getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, BasicBlock *BBTo) { // TODO: Handle more complex conditionals. If (v == 0 || v2 < 1) is false, we // know that v != 0. if (BranchInst *BI = dyn_cast(BBFrom->getTerminator())) { // If this is a conditional branch and only one successor goes to BBTo, then // we may be able to infer something from the condition. if (BI->isConditional() && BI->getSuccessor(0) != BI->getSuccessor(1)) { bool isTrueDest = BI->getSuccessor(0) == BBTo; assert(BI->getSuccessor(!isTrueDest) == BBTo && "BBTo isn't a successor of BBFrom"); Value *Condition = BI->getCondition(); // If V is the condition of the branch itself, then we know exactly what // it is. if (Condition == Val) return ValueLatticeElement::get(ConstantInt::get( Type::getInt1Ty(Val->getContext()), isTrueDest)); // If the condition of the branch is an equality comparison, we may be // able to infer the value. ValueLatticeElement Result = getValueFromCondition(Val, Condition, isTrueDest); if (!Result.isOverdefined()) return Result; if (User *Usr = dyn_cast(Val)) { assert(Result.isOverdefined() && "Result isn't overdefined"); // Check with isOperationFoldable() first to avoid linearly iterating // over the operands unnecessarily which can be expensive for // instructions with many operands. if (isa(Usr->getType()) && isOperationFoldable(Usr)) { const DataLayout &DL = BBTo->getModule()->getDataLayout(); if (usesOperand(Usr, Condition)) { // If Val has Condition as an operand and Val can be folded into a // constant with either Condition == true or Condition == false, // propagate the constant. // eg. // ; %Val is true on the edge to %then. // %Val = and i1 %Condition, true. // br %Condition, label %then, label %else APInt ConditionVal(1, isTrueDest ? 
1 : 0); Result = constantFoldUser(Usr, Condition, ConditionVal, DL); } else { // If one of Val's operand has an inferred value, we may be able to // infer the value of Val. // eg. // ; %Val is 94 on the edge to %then. // %Val = add i8 %Op, 1 // %Condition = icmp eq i8 %Op, 93 // br i1 %Condition, label %then, label %else for (unsigned i = 0; i < Usr->getNumOperands(); ++i) { Value *Op = Usr->getOperand(i); ValueLatticeElement OpLatticeVal = getValueFromCondition(Op, Condition, isTrueDest); if (std::optional OpConst = OpLatticeVal.asConstantInteger()) { Result = constantFoldUser(Usr, Op, *OpConst, DL); break; } } } } } if (!Result.isOverdefined()) return Result; } } // If the edge was formed by a switch on the value, then we may know exactly // what it is. if (SwitchInst *SI = dyn_cast(BBFrom->getTerminator())) { Value *Condition = SI->getCondition(); if (!isa(Val->getType())) return std::nullopt; bool ValUsesConditionAndMayBeFoldable = false; if (Condition != Val) { // Check if Val has Condition as an operand. if (User *Usr = dyn_cast(Val)) ValUsesConditionAndMayBeFoldable = isOperationFoldable(Usr) && usesOperand(Usr, Condition); if (!ValUsesConditionAndMayBeFoldable) return std::nullopt; } assert((Condition == Val || ValUsesConditionAndMayBeFoldable) && "Condition != Val nor Val doesn't use Condition"); bool DefaultCase = SI->getDefaultDest() == BBTo; unsigned BitWidth = Val->getType()->getIntegerBitWidth(); ConstantRange EdgesVals(BitWidth, DefaultCase/*isFullSet*/); for (auto Case : SI->cases()) { APInt CaseValue = Case.getCaseValue()->getValue(); ConstantRange EdgeVal(CaseValue); if (ValUsesConditionAndMayBeFoldable) { User *Usr = cast(Val); const DataLayout &DL = BBTo->getModule()->getDataLayout(); ValueLatticeElement EdgeLatticeVal = constantFoldUser(Usr, Condition, CaseValue, DL); if (EdgeLatticeVal.isOverdefined()) return std::nullopt; EdgeVal = EdgeLatticeVal.getConstantRange(); } if (DefaultCase) { // It is possible that the default destination is the destination of // some cases. We cannot perform difference for those cases. // We know Condition != CaseValue in BBTo. In some cases we can use // this to infer Val == f(Condition) is != f(CaseValue). For now, we // only do this when f is identity (i.e. Val == Condition), but we // should be able to do this for any injective f. if (Case.getCaseSuccessor() != BBTo && Condition == Val) EdgesVals = EdgesVals.difference(EdgeVal); } else if (Case.getCaseSuccessor() == BBTo) EdgesVals = EdgesVals.unionWith(EdgeVal); } return ValueLatticeElement::getRange(std::move(EdgesVals)); } return std::nullopt; } /// Compute the value of Val on the edge BBFrom -> BBTo or the value at /// the basic block if the edge does not constrain Val. std::optional LazyValueInfoImpl::getEdgeValue(Value *Val, BasicBlock *BBFrom, BasicBlock *BBTo, Instruction *CxtI) { // If already a constant, there is nothing to compute. 
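// Worked example for the switch handling in getEdgeValueLocal above
// (hypothetical IR):
//
//   switch i32 %x, label %default [ i32 1, label %target
//                                   i32 2, label %target ]
//
// On the edge to %target the case values are unioned, so %x is in [1, 3).
// On the edge to %default each case value is subtracted from the full set,
// so %x is in the wrapped range [3, 1), i.e. every i32 except 1 and 2.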
if (Constant *VC = dyn_cast(Val)) return ValueLatticeElement::get(VC); ValueLatticeElement LocalResult = getEdgeValueLocal(Val, BBFrom, BBTo) .value_or(ValueLatticeElement::getOverdefined()); if (hasSingleValue(LocalResult)) // Can't get any more precise here return LocalResult; std::optional OptInBlock = getBlockValue(Val, BBFrom, BBFrom->getTerminator()); if (!OptInBlock) return std::nullopt; ValueLatticeElement &InBlock = *OptInBlock; // We can use the context instruction (generically the ultimate instruction // the calling pass is trying to simplify) here, even though the result of // this function is generally cached when called from the solve* functions // (and that cached result might be used with queries using a different // context instruction), because when this function is called from the solve* // functions, the context instruction is not provided. When called from // LazyValueInfoImpl::getValueOnEdge, the context instruction is provided, // but then the result is not cached. intersectAssumeOrGuardBlockValueConstantRange(Val, InBlock, CxtI); return intersect(LocalResult, InBlock); } ValueLatticeElement LazyValueInfoImpl::getValueInBlock(Value *V, BasicBlock *BB, Instruction *CxtI) { LLVM_DEBUG(dbgs() << "LVI Getting block end value " << *V << " at '" << BB->getName() << "'\n"); assert(BlockValueStack.empty() && BlockValueSet.empty()); std::optional OptResult = getBlockValue(V, BB, CxtI); if (!OptResult) { solve(); OptResult = getBlockValue(V, BB, CxtI); assert(OptResult && "Value not available after solving"); } ValueLatticeElement Result = *OptResult; LLVM_DEBUG(dbgs() << " Result = " << Result << "\n"); return Result; } ValueLatticeElement LazyValueInfoImpl::getValueAt(Value *V, Instruction *CxtI) { LLVM_DEBUG(dbgs() << "LVI Getting value " << *V << " at '" << CxtI->getName() << "'\n"); if (auto *C = dyn_cast(V)) return ValueLatticeElement::get(C); ValueLatticeElement Result = ValueLatticeElement::getOverdefined(); if (auto *I = dyn_cast(V)) Result = getFromRangeMetadata(I); intersectAssumeOrGuardBlockValueConstantRange(V, Result, CxtI); LLVM_DEBUG(dbgs() << " Result = " << Result << "\n"); return Result; } ValueLatticeElement LazyValueInfoImpl:: getValueOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB, Instruction *CxtI) { LLVM_DEBUG(dbgs() << "LVI Getting edge value " << *V << " from '" << FromBB->getName() << "' to '" << ToBB->getName() << "'\n"); std::optional Result = getEdgeValue(V, FromBB, ToBB, CxtI); if (!Result) { solve(); Result = getEdgeValue(V, FromBB, ToBB, CxtI); assert(Result && "More work to do after problem solved?"); } LLVM_DEBUG(dbgs() << " Result = " << *Result << "\n"); return *Result; } void LazyValueInfoImpl::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, BasicBlock *NewSucc) { TheCache.threadEdgeImpl(OldSucc, NewSucc); } //===----------------------------------------------------------------------===// // LazyValueInfo Impl //===----------------------------------------------------------------------===// /// This lazily constructs the LazyValueInfoImpl. 
static LazyValueInfoImpl &getImpl(void *&PImpl, AssumptionCache *AC, const Module *M) { if (!PImpl) { assert(M && "getCache() called with a null Module"); const DataLayout &DL = M->getDataLayout(); Function *GuardDecl = M->getFunction( Intrinsic::getName(Intrinsic::experimental_guard)); PImpl = new LazyValueInfoImpl(AC, DL, GuardDecl); } return *static_cast(PImpl); } bool LazyValueInfoWrapperPass::runOnFunction(Function &F) { Info.AC = &getAnalysis().getAssumptionCache(F); Info.TLI = &getAnalysis().getTLI(F); if (Info.PImpl) getImpl(Info.PImpl, Info.AC, F.getParent()).clear(); // Fully lazy. return false; } void LazyValueInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired(); AU.addRequired(); } LazyValueInfo &LazyValueInfoWrapperPass::getLVI() { return Info; } LazyValueInfo::~LazyValueInfo() { releaseMemory(); } void LazyValueInfo::releaseMemory() { // If the cache was allocated, free it. if (PImpl) { delete &getImpl(PImpl, AC, nullptr); PImpl = nullptr; } } bool LazyValueInfo::invalidate(Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &Inv) { // We need to invalidate if we have either failed to preserve this analyses // result directly or if any of its dependencies have been invalidated. auto PAC = PA.getChecker(); if (!(PAC.preserved() || PAC.preservedSet>())) return true; return false; } void LazyValueInfoWrapperPass::releaseMemory() { Info.releaseMemory(); } LazyValueInfo LazyValueAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { auto &AC = FAM.getResult(F); auto &TLI = FAM.getResult(F); return LazyValueInfo(&AC, &F.getParent()->getDataLayout(), &TLI); } /// Returns true if we can statically tell that this value will never be a /// "useful" constant. In practice, this means we've got something like an /// alloca or a malloc call for which a comparison against a constant can /// only be guarding dead code. Note that we are potentially giving up some /// precision in dead code (a constant result) in favour of avoiding a /// expensive search for a easily answered common query. static bool isKnownNonConstant(Value *V) { V = V->stripPointerCasts(); // The return val of alloc cannot be a Constant. if (isa(V)) return true; return false; } Constant *LazyValueInfo::getConstant(Value *V, Instruction *CxtI) { // Bail out early if V is known not to be a Constant. if (isKnownNonConstant(V)) return nullptr; BasicBlock *BB = CxtI->getParent(); ValueLatticeElement Result = getImpl(PImpl, AC, BB->getModule()).getValueInBlock(V, BB, CxtI); if (Result.isConstant()) return Result.getConstant(); if (Result.isConstantRange()) { const ConstantRange &CR = Result.getConstantRange(); if (const APInt *SingleVal = CR.getSingleElement()) return ConstantInt::get(V->getContext(), *SingleVal); } return nullptr; } ConstantRange LazyValueInfo::getConstantRange(Value *V, Instruction *CxtI, bool UndefAllowed) { assert(V->getType()->isIntegerTy()); unsigned Width = V->getType()->getIntegerBitWidth(); BasicBlock *BB = CxtI->getParent(); ValueLatticeElement Result = getImpl(PImpl, AC, BB->getModule()).getValueInBlock(V, BB, CxtI); if (Result.isUnknown()) return ConstantRange::getEmpty(Width); if (Result.isConstantRange(UndefAllowed)) return Result.getConstantRange(UndefAllowed); // We represent ConstantInt constants as constant ranges but other kinds // of integer constants, i.e. 
ConstantExpr will be tagged as constants assert(!(Result.isConstant() && isa(Result.getConstant())) && "ConstantInt value must be represented as constantrange"); return ConstantRange::getFull(Width); } ConstantRange LazyValueInfo::getConstantRangeAtUse(const Use &U, bool UndefAllowed) { Value *V = U.get(); ConstantRange CR = getConstantRange(V, cast(U.getUser()), UndefAllowed); // Check whether the only (possibly transitive) use of the value is in a // position where V can be constrained by a select or branch condition. const Use *CurrU = &U; // TODO: Increase limit? const unsigned MaxUsesToInspect = 3; for (unsigned I = 0; I < MaxUsesToInspect; ++I) { std::optional CondVal; auto *CurrI = cast(CurrU->getUser()); if (auto *SI = dyn_cast(CurrI)) { // If the value is undef, a different value may be chosen in // the select condition and at use. if (!isGuaranteedNotToBeUndefOrPoison(SI->getCondition(), AC)) break; if (CurrU->getOperandNo() == 1) CondVal = getValueFromCondition(V, SI->getCondition(), true); else if (CurrU->getOperandNo() == 2) CondVal = getValueFromCondition(V, SI->getCondition(), false); } else if (auto *PHI = dyn_cast(CurrI)) { // TODO: Use non-local query? CondVal = getEdgeValueLocal(V, PHI->getIncomingBlock(*CurrU), PHI->getParent()); } if (CondVal && CondVal->isConstantRange()) CR = CR.intersectWith(CondVal->getConstantRange()); // Only follow one-use chain, to allow direct intersection of conditions. // If there are multiple uses, we would have to intersect with the union of // all conditions at different uses. // Stop walking if we hit a non-speculatable instruction. Even if the // result is only used under a specific condition, executing the // instruction itself may cause side effects or UB already. // This also disallows looking through phi nodes: If the phi node is part // of a cycle, we might end up reasoning about values from different cycle // iterations (PR60629). if (!CurrI->hasOneUse() || !isSafeToSpeculativelyExecute(CurrI)) break; CurrU = &*CurrI->use_begin(); } return CR; } /// Determine whether the specified value is known to be a /// constant on the specified edge. Return null if not. Constant *LazyValueInfo::getConstantOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB, Instruction *CxtI) { Module *M = FromBB->getModule(); ValueLatticeElement Result = getImpl(PImpl, AC, M).getValueOnEdge(V, FromBB, ToBB, CxtI); if (Result.isConstant()) return Result.getConstant(); if (Result.isConstantRange()) { const ConstantRange &CR = Result.getConstantRange(); if (const APInt *SingleVal = CR.getSingleElement()) return ConstantInt::get(V->getContext(), *SingleVal); } return nullptr; } ConstantRange LazyValueInfo::getConstantRangeOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB, Instruction *CxtI) { unsigned Width = V->getType()->getIntegerBitWidth(); Module *M = FromBB->getModule(); ValueLatticeElement Result = getImpl(PImpl, AC, M).getValueOnEdge(V, FromBB, ToBB, CxtI); if (Result.isUnknown()) return ConstantRange::getEmpty(Width); if (Result.isConstantRange()) return Result.getConstantRange(); // We represent ConstantInt constants as constant ranges but other kinds // of integer constants, i.e. 
ConstantExpr will be tagged as constants assert(!(Result.isConstant() && isa(Result.getConstant())) && "ConstantInt value must be represented as constantrange"); return ConstantRange::getFull(Width); } static LazyValueInfo::Tristate getPredicateResult(unsigned Pred, Constant *C, const ValueLatticeElement &Val, const DataLayout &DL, TargetLibraryInfo *TLI) { // If we know the value is a constant, evaluate the conditional. Constant *Res = nullptr; if (Val.isConstant()) { Res = ConstantFoldCompareInstOperands(Pred, Val.getConstant(), C, DL, TLI); if (ConstantInt *ResCI = dyn_cast_or_null(Res)) return ResCI->isZero() ? LazyValueInfo::False : LazyValueInfo::True; return LazyValueInfo::Unknown; } if (Val.isConstantRange()) { ConstantInt *CI = dyn_cast(C); if (!CI) return LazyValueInfo::Unknown; const ConstantRange &CR = Val.getConstantRange(); if (Pred == ICmpInst::ICMP_EQ) { if (!CR.contains(CI->getValue())) return LazyValueInfo::False; if (CR.isSingleElement()) return LazyValueInfo::True; } else if (Pred == ICmpInst::ICMP_NE) { if (!CR.contains(CI->getValue())) return LazyValueInfo::True; if (CR.isSingleElement()) return LazyValueInfo::False; } else { // Handle more complex predicates. ConstantRange TrueValues = ConstantRange::makeExactICmpRegion( (ICmpInst::Predicate)Pred, CI->getValue()); if (TrueValues.contains(CR)) return LazyValueInfo::True; if (TrueValues.inverse().contains(CR)) return LazyValueInfo::False; } return LazyValueInfo::Unknown; } if (Val.isNotConstant()) { // If this is an equality comparison, we can try to fold it knowing that // "V != C1". if (Pred == ICmpInst::ICMP_EQ) { // !C1 == C -> false iff C1 == C. Res = ConstantFoldCompareInstOperands(ICmpInst::ICMP_NE, Val.getNotConstant(), C, DL, TLI); if (Res && Res->isNullValue()) return LazyValueInfo::False; } else if (Pred == ICmpInst::ICMP_NE) { // !C1 != C -> true iff C1 == C. Res = ConstantFoldCompareInstOperands(ICmpInst::ICMP_NE, Val.getNotConstant(), C, DL, TLI); if (Res && Res->isNullValue()) return LazyValueInfo::True; } return LazyValueInfo::Unknown; } return LazyValueInfo::Unknown; } /// Determine whether the specified value comparison with a constant is known to /// be true or false on the specified CFG edge. Pred is a CmpInst predicate. LazyValueInfo::Tristate LazyValueInfo::getPredicateOnEdge(unsigned Pred, Value *V, Constant *C, BasicBlock *FromBB, BasicBlock *ToBB, Instruction *CxtI) { Module *M = FromBB->getModule(); ValueLatticeElement Result = getImpl(PImpl, AC, M).getValueOnEdge(V, FromBB, ToBB, CxtI); return getPredicateResult(Pred, C, Result, M->getDataLayout(), TLI); } LazyValueInfo::Tristate LazyValueInfo::getPredicateAt(unsigned Pred, Value *V, Constant *C, Instruction *CxtI, bool UseBlockValue) { // Is or is not NonNull are common predicates being queried. If // isKnownNonZero can tell us the result of the predicate, we can // return it quickly. But this is only a fastpath, and falling // through would still be correct. Module *M = CxtI->getModule(); const DataLayout &DL = M->getDataLayout(); if (V->getType()->isPointerTy() && C->isNullValue() && isKnownNonZero(V->stripPointerCastsSameRepresentation(), DL)) { if (Pred == ICmpInst::ICMP_EQ) return LazyValueInfo::False; else if (Pred == ICmpInst::ICMP_NE) return LazyValueInfo::True; } ValueLatticeElement Result = UseBlockValue ? 
getImpl(PImpl, AC, M).getValueInBlock(V, CxtI->getParent(), CxtI) : getImpl(PImpl, AC, M).getValueAt(V, CxtI); Tristate Ret = getPredicateResult(Pred, C, Result, DL, TLI); if (Ret != Unknown) return Ret; // Note: The following bit of code is somewhat distinct from the rest of LVI; // LVI as a whole tries to compute a lattice value which is conservatively // correct at a given location. In this case, we have a predicate which we // weren't able to prove about the merged result, and we're pushing that // predicate back along each incoming edge to see if we can prove it // separately for each input. As a motivating example, consider: // bb1: // %v1 = ... ; constantrange<1, 5> // br label %merge // bb2: // %v2 = ... ; constantrange<10, 20> // br label %merge // merge: // %phi = phi [%v1, %v2] ; constantrange<1,20> // %pred = icmp eq i32 %phi, 8 // We can't tell from the lattice value for '%phi' that '%pred' is false // along each path, but by checking the predicate over each input separately, // we can. // We limit the search to one step backwards from the current BB and value. // We could consider extending this to search further backwards through the // CFG and/or value graph, but there are non-obvious compile time vs quality // tradeoffs. BasicBlock *BB = CxtI->getParent(); // Function entry or an unreachable block. Bail to avoid confusing // analysis below. pred_iterator PI = pred_begin(BB), PE = pred_end(BB); if (PI == PE) return Unknown; // If V is a PHI node in the same block as the context, we need to ask // questions about the predicate as applied to the incoming value along // each edge. This is useful for eliminating cases where the predicate is // known along all incoming edges. if (auto *PHI = dyn_cast(V)) if (PHI->getParent() == BB) { Tristate Baseline = Unknown; for (unsigned i = 0, e = PHI->getNumIncomingValues(); i < e; i++) { Value *Incoming = PHI->getIncomingValue(i); BasicBlock *PredBB = PHI->getIncomingBlock(i); // Note that PredBB may be BB itself. Tristate Result = getPredicateOnEdge(Pred, Incoming, C, PredBB, BB, CxtI); // Keep going as long as we've seen a consistent known result for // all inputs. Baseline = (i == 0) ? Result /* First iteration */ : (Baseline == Result ? Baseline : Unknown); /* All others */ if (Baseline == Unknown) break; } if (Baseline != Unknown) return Baseline; } // For a comparison where the V is outside this block, it's possible // that we've branched on it before. Look to see if the value is known // on all incoming edges. if (!isa(V) || cast(V)->getParent() != BB) { // For predecessor edge, determine if the comparison is true or false // on that edge. If they're all true or all false, we can conclude // the value of the comparison in this block. Tristate Baseline = getPredicateOnEdge(Pred, V, C, *PI, BB, CxtI); if (Baseline != Unknown) { // Check that all remaining incoming values match the first one. while (++PI != PE) { Tristate Ret = getPredicateOnEdge(Pred, V, C, *PI, BB, CxtI); if (Ret != Baseline) break; } // If we terminated early, then one of the values didn't match. if (PI == PE) { return Baseline; } } } return Unknown; } LazyValueInfo::Tristate LazyValueInfo::getPredicateAt(unsigned P, Value *LHS, Value *RHS, Instruction *CxtI, bool UseBlockValue) { CmpInst::Predicate Pred = (CmpInst::Predicate)P; if (auto *C = dyn_cast(RHS)) return getPredicateAt(P, LHS, C, CxtI, UseBlockValue); if (auto *C = dyn_cast(LHS)) return getPredicateAt(CmpInst::getSwappedPredicate(Pred), RHS, C, CxtI, UseBlockValue); // Got two non-Constant values. 
Try to determine the comparison results based // on the block values of the two operands, e.g. because they have // non-overlapping ranges. if (UseBlockValue) { Module *M = CxtI->getModule(); ValueLatticeElement L = getImpl(PImpl, AC, M).getValueInBlock(LHS, CxtI->getParent(), CxtI); if (L.isOverdefined()) return LazyValueInfo::Unknown; ValueLatticeElement R = getImpl(PImpl, AC, M).getValueInBlock(RHS, CxtI->getParent(), CxtI); Type *Ty = CmpInst::makeCmpResultType(LHS->getType()); if (Constant *Res = L.getCompare((CmpInst::Predicate)P, Ty, R, M->getDataLayout())) { if (Res->isNullValue()) return LazyValueInfo::False; if (Res->isOneValue()) return LazyValueInfo::True; } } return LazyValueInfo::Unknown; } void LazyValueInfo::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, BasicBlock *NewSucc) { if (PImpl) { getImpl(PImpl, AC, PredBB->getModule()) .threadEdge(PredBB, OldSucc, NewSucc); } } +void LazyValueInfo::forgetValue(Value *V) { + if (PImpl) + getImpl(PImpl, AC, nullptr).forgetValue(V); +} + void LazyValueInfo::eraseBlock(BasicBlock *BB) { if (PImpl) { getImpl(PImpl, AC, BB->getModule()).eraseBlock(BB); } } void LazyValueInfo::clear(const Module *M) { if (PImpl) { getImpl(PImpl, AC, M).clear(); } } void LazyValueInfo::printLVI(Function &F, DominatorTree &DTree, raw_ostream &OS) { if (PImpl) { getImpl(PImpl, AC, F.getParent()).printLVI(F, DTree, OS); } } // Print the LVI for the function arguments at the start of each basic block. void LazyValueInfoAnnotatedWriter::emitBasicBlockStartAnnot( const BasicBlock *BB, formatted_raw_ostream &OS) { // Find if there are latticevalues defined for arguments of the function. auto *F = BB->getParent(); for (const auto &Arg : F->args()) { ValueLatticeElement Result = LVIImpl->getValueInBlock( const_cast(&Arg), const_cast(BB)); if (Result.isUnknown()) continue; OS << "; LatticeVal for: '" << Arg << "' is: " << Result << "\n"; } } // This function prints the LVI analysis for the instruction I at the beginning // of various basic blocks. It relies on calculated values that are stored in // the LazyValueInfoCache, and in the absence of cached values, recalculate the // LazyValueInfo for `I`, and print that info. void LazyValueInfoAnnotatedWriter::emitInstructionAnnot( const Instruction *I, formatted_raw_ostream &OS) { auto *ParentBB = I->getParent(); SmallPtrSet BlocksContainingLVI; // We can generate (solve) LVI values only for blocks that are dominated by // the I's parent. However, to avoid generating LVI for all dominating blocks, // that contain redundant/uninteresting information, we print LVI for // blocks that may use this LVI information (such as immediate successor // blocks, and blocks that contain uses of `I`). auto printResult = [&](const BasicBlock *BB) { if (!BlocksContainingLVI.insert(BB).second) return; ValueLatticeElement Result = LVIImpl->getValueInBlock( const_cast(I), const_cast(BB)); OS << "; LatticeVal for: '" << *I << "' in BB: '"; BB->printAsOperand(OS, false); OS << "' is: " << Result << "\n"; }; printResult(ParentBB); // Print the LVI analysis results for the immediate successor blocks, that // are dominated by `ParentBB`. for (const auto *BBSucc : successors(ParentBB)) if (DT.dominates(ParentBB, BBSucc)) printResult(BBSucc); // Print LVI in blocks where `I` is used. for (const auto *U : I->users()) if (auto *UseI = dyn_cast(U)) if (!isa(UseI) || DT.dominates(ParentBB, UseI->getParent())) printResult(UseI->getParent()); } namespace { // Printer class for LazyValueInfo results. 
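// Usage sketch for the forgetValue() entry point added earlier in this file
// (hypothetical caller, not code from this change): a transform that rewrites
// a single instruction can drop just that value's cached lattice facts
// instead of erasing a whole block:
//
//   // ... mutate I in a way that may invalidate previously computed ranges ...
//   LVI->forgetValue(&I);   // discard cached info for I only
//   // later queries about I are recomputed lazily on demand
//
// where LVI is the LazyValueInfo analysis result available to the pass.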
class LazyValueInfoPrinter : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid LazyValueInfoPrinter() : FunctionPass(ID) { initializeLazyValueInfoPrinterPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); AU.addRequired(); AU.addRequired(); } // Get the mandatory dominator tree analysis and pass this in to the // LVIPrinter. We cannot rely on the LVI's DT, since it's optional. bool runOnFunction(Function &F) override { dbgs() << "LVI for function '" << F.getName() << "':\n"; auto &LVI = getAnalysis().getLVI(); auto &DTree = getAnalysis().getDomTree(); LVI.printLVI(F, DTree, dbgs()); return false; } }; } char LazyValueInfoPrinter::ID = 0; INITIALIZE_PASS_BEGIN(LazyValueInfoPrinter, "print-lazy-value-info", "Lazy Value Info Printer Pass", false, false) INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass) INITIALIZE_PASS_END(LazyValueInfoPrinter, "print-lazy-value-info", "Lazy Value Info Printer Pass", false, false) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 111d4d30aab9..39ab48b4a48e 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -1,15330 +1,15330 @@ //===- ScalarEvolution.cpp - Scalar Evolution Analysis --------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file contains the implementation of the scalar evolution analysis // engine, which is used primarily to analyze expressions involving induction // variables in loops. // // There are several aspects to this library. First is the representation of // scalar expressions, which are represented as subclasses of the SCEV class. // These classes are used to represent certain types of subexpressions that we // can handle. We only create one SCEV of a particular shape, so // pointer-comparisons for equality are legal. // // One important aspect of the SCEV objects is that they are never cyclic, even // if there is a cycle in the dataflow for an expression (ie, a PHI node). If // the PHI node is one of the idioms that we can represent (e.g., a polynomial // recurrence) then we represent it directly as a recurrence node, otherwise we // represent it as a SCEVUnknown node. // // In addition to being able to represent expressions of various types, we also // have folders that are used to build the *canonical* representation for a // particular expression. These folders are capable of using a variety of // rewrite rules to simplify the expressions. // // Once the folders are defined, we can implement the more interesting // higher-level code, such as the code that recognizes PHI nodes of various // types, computes the execution count of a loop, etc. // // TODO: We should use these routines and value representations to implement // dependence analysis! // //===----------------------------------------------------------------------===// // // There are several good references for the techniques used in this analysis. // // Chains of recurrences -- a method to expedite the evaluation // of closed-form functions // Olaf Bachmann, Paul S. Wang, Eugene V. Zima // // On computational properties of chains of recurrences // Eugene V. 
Zima // // Symbolic Evaluation of Chains of Recurrences for Loop Optimization // Robert A. van Engelen // // Efficient Symbolic Analysis for Optimizing Compilers // Robert A. van Engelen // // Using the chains of recurrences algebra for data dependence testing and // induction variable substitution // MS Thesis, Johnie Birch // //===----------------------------------------------------------------------===// #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/SaveAndRestore.h" #include "llvm/Support/raw_ostream.h" #include #include #include #include #include #include #include #include #include #include #include #include using namespace llvm; using namespace PatternMatch; #define DEBUG_TYPE "scalar-evolution" STATISTIC(NumExitCountsComputed, "Number of loop exits with predictable exit counts"); STATISTIC(NumExitCountsNotComputed, "Number of loop exits without predictable exit counts"); STATISTIC(NumBruteForceTripCountsComputed, "Number of loops with trip counts computed by force"); #ifdef EXPENSIVE_CHECKS bool llvm::VerifySCEV = true; #else bool llvm::VerifySCEV = false; #endif static cl::opt MaxBruteForceIterations("scalar-evolution-max-iterations", cl::ReallyHidden, cl::desc("Maximum number of iterations SCEV will " "symbolically execute a constant " "derived loop"), cl::init(100)); static cl::opt VerifySCEVOpt( "verify-scev", cl::Hidden, cl::location(VerifySCEV), cl::desc("Verify ScalarEvolution's backedge taken counts (slow)")); static cl::opt VerifySCEVStrict( "verify-scev-strict", cl::Hidden, cl::desc("Enable stricter verification with 
-verify-scev is passed")); static cl::opt VerifyIR( "scev-verify-ir", cl::Hidden, cl::desc("Verify IR correctness when making sensitive SCEV queries (slow)"), cl::init(false)); static cl::opt MulOpsInlineThreshold( "scev-mulops-inline-threshold", cl::Hidden, cl::desc("Threshold for inlining multiplication operands into a SCEV"), cl::init(32)); static cl::opt AddOpsInlineThreshold( "scev-addops-inline-threshold", cl::Hidden, cl::desc("Threshold for inlining addition operands into a SCEV"), cl::init(500)); static cl::opt MaxSCEVCompareDepth( "scalar-evolution-max-scev-compare-depth", cl::Hidden, cl::desc("Maximum depth of recursive SCEV complexity comparisons"), cl::init(32)); static cl::opt MaxSCEVOperationsImplicationDepth( "scalar-evolution-max-scev-operations-implication-depth", cl::Hidden, cl::desc("Maximum depth of recursive SCEV operations implication analysis"), cl::init(2)); static cl::opt MaxValueCompareDepth( "scalar-evolution-max-value-compare-depth", cl::Hidden, cl::desc("Maximum depth of recursive value complexity comparisons"), cl::init(2)); static cl::opt MaxArithDepth("scalar-evolution-max-arith-depth", cl::Hidden, cl::desc("Maximum depth of recursive arithmetics"), cl::init(32)); static cl::opt MaxConstantEvolvingDepth( "scalar-evolution-max-constant-evolving-depth", cl::Hidden, cl::desc("Maximum depth of recursive constant evolving"), cl::init(32)); static cl::opt MaxCastDepth("scalar-evolution-max-cast-depth", cl::Hidden, cl::desc("Maximum depth of recursive SExt/ZExt/Trunc"), cl::init(8)); static cl::opt MaxAddRecSize("scalar-evolution-max-add-rec-size", cl::Hidden, cl::desc("Max coefficients in AddRec during evolving"), cl::init(8)); static cl::opt HugeExprThreshold("scalar-evolution-huge-expr-threshold", cl::Hidden, cl::desc("Size of the expression which is considered huge"), cl::init(4096)); static cl::opt RangeIterThreshold( "scev-range-iter-threshold", cl::Hidden, cl::desc("Threshold for switching to iteratively computing SCEV ranges"), cl::init(32)); static cl::opt ClassifyExpressions("scalar-evolution-classify-expressions", cl::Hidden, cl::init(true), cl::desc("When printing analysis, include information on every instruction")); static cl::opt UseExpensiveRangeSharpening( "scalar-evolution-use-expensive-range-sharpening", cl::Hidden, cl::init(false), cl::desc("Use more powerful methods of sharpening expression ranges. May " "be costly in terms of compile time")); static cl::opt MaxPhiSCCAnalysisSize( "scalar-evolution-max-scc-analysis-depth", cl::Hidden, cl::desc("Maximum amount of nodes to process while searching SCEVUnknown " "Phi strongly connected components"), cl::init(8)); static cl::opt EnableFiniteLoopControl("scalar-evolution-finite-loop", cl::Hidden, cl::desc("Handle <= and >= in finite loops"), cl::init(true)); static cl::opt UseContextForNoWrapFlagInference( "scalar-evolution-use-context-for-no-wrap-flag-strenghening", cl::Hidden, cl::desc("Infer nuw/nsw flags using context where suitable"), cl::init(true)); //===----------------------------------------------------------------------===// // SCEV class definitions //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // Implementation of the SCEV class. 
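// The command-line knobs defined just above this section header all follow
// the usual cl::opt pattern: a file-scope option object that is read wherever
// the corresponding limit is needed.  As a minimal illustration with
// hypothetical names (not an option that exists in this file):
//
//   static cl::opt<unsigned> MyPassThreshold(
//       "my-pass-threshold", cl::Hidden,
//       cl::desc("Give up after inspecting this many candidates"),
//       cl::init(16));
//
//   // ... and at a use site ...
//   if (Candidates.size() > MyPassThreshold)
//     return; // respect the command-line limit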
// #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void SCEV::dump() const { print(dbgs()); dbgs() << '\n'; } #endif void SCEV::print(raw_ostream &OS) const { switch (getSCEVType()) { case scConstant: cast(this)->getValue()->printAsOperand(OS, false); return; case scVScale: OS << "vscale"; return; case scPtrToInt: { const SCEVPtrToIntExpr *PtrToInt = cast(this); const SCEV *Op = PtrToInt->getOperand(); OS << "(ptrtoint " << *Op->getType() << " " << *Op << " to " << *PtrToInt->getType() << ")"; return; } case scTruncate: { const SCEVTruncateExpr *Trunc = cast(this); const SCEV *Op = Trunc->getOperand(); OS << "(trunc " << *Op->getType() << " " << *Op << " to " << *Trunc->getType() << ")"; return; } case scZeroExtend: { const SCEVZeroExtendExpr *ZExt = cast(this); const SCEV *Op = ZExt->getOperand(); OS << "(zext " << *Op->getType() << " " << *Op << " to " << *ZExt->getType() << ")"; return; } case scSignExtend: { const SCEVSignExtendExpr *SExt = cast(this); const SCEV *Op = SExt->getOperand(); OS << "(sext " << *Op->getType() << " " << *Op << " to " << *SExt->getType() << ")"; return; } case scAddRecExpr: { const SCEVAddRecExpr *AR = cast(this); OS << "{" << *AR->getOperand(0); for (unsigned i = 1, e = AR->getNumOperands(); i != e; ++i) OS << ",+," << *AR->getOperand(i); OS << "}<"; if (AR->hasNoUnsignedWrap()) OS << "nuw><"; if (AR->hasNoSignedWrap()) OS << "nsw><"; if (AR->hasNoSelfWrap() && !AR->getNoWrapFlags((NoWrapFlags)(FlagNUW | FlagNSW))) OS << "nw><"; AR->getLoop()->getHeader()->printAsOperand(OS, /*PrintType=*/false); OS << ">"; return; } case scAddExpr: case scMulExpr: case scUMaxExpr: case scSMaxExpr: case scUMinExpr: case scSMinExpr: case scSequentialUMinExpr: { const SCEVNAryExpr *NAry = cast(this); const char *OpStr = nullptr; switch (NAry->getSCEVType()) { case scAddExpr: OpStr = " + "; break; case scMulExpr: OpStr = " * "; break; case scUMaxExpr: OpStr = " umax "; break; case scSMaxExpr: OpStr = " smax "; break; case scUMinExpr: OpStr = " umin "; break; case scSMinExpr: OpStr = " smin "; break; case scSequentialUMinExpr: OpStr = " umin_seq "; break; default: llvm_unreachable("There are no other nary expression types."); } OS << "("; ListSeparator LS(OpStr); for (const SCEV *Op : NAry->operands()) OS << LS << *Op; OS << ")"; switch (NAry->getSCEVType()) { case scAddExpr: case scMulExpr: if (NAry->hasNoUnsignedWrap()) OS << ""; if (NAry->hasNoSignedWrap()) OS << ""; break; default: // Nothing to print for other nary expressions. 
break; } return; } case scUDivExpr: { const SCEVUDivExpr *UDiv = cast(this); OS << "(" << *UDiv->getLHS() << " /u " << *UDiv->getRHS() << ")"; return; } case scUnknown: cast(this)->getValue()->printAsOperand(OS, false); return; case scCouldNotCompute: OS << "***COULDNOTCOMPUTE***"; return; } llvm_unreachable("Unknown SCEV kind!"); } Type *SCEV::getType() const { switch (getSCEVType()) { case scConstant: return cast(this)->getType(); case scVScale: return cast(this)->getType(); case scPtrToInt: case scTruncate: case scZeroExtend: case scSignExtend: return cast(this)->getType(); case scAddRecExpr: return cast(this)->getType(); case scMulExpr: return cast(this)->getType(); case scUMaxExpr: case scSMaxExpr: case scUMinExpr: case scSMinExpr: return cast(this)->getType(); case scSequentialUMinExpr: return cast(this)->getType(); case scAddExpr: return cast(this)->getType(); case scUDivExpr: return cast(this)->getType(); case scUnknown: return cast(this)->getType(); case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); } llvm_unreachable("Unknown SCEV kind!"); } ArrayRef SCEV::operands() const { switch (getSCEVType()) { case scConstant: case scVScale: case scUnknown: return {}; case scPtrToInt: case scTruncate: case scZeroExtend: case scSignExtend: return cast(this)->operands(); case scAddRecExpr: case scAddExpr: case scMulExpr: case scUMaxExpr: case scSMaxExpr: case scUMinExpr: case scSMinExpr: case scSequentialUMinExpr: return cast(this)->operands(); case scUDivExpr: return cast(this)->operands(); case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); } llvm_unreachable("Unknown SCEV kind!"); } bool SCEV::isZero() const { if (const SCEVConstant *SC = dyn_cast(this)) return SC->getValue()->isZero(); return false; } bool SCEV::isOne() const { if (const SCEVConstant *SC = dyn_cast(this)) return SC->getValue()->isOne(); return false; } bool SCEV::isAllOnesValue() const { if (const SCEVConstant *SC = dyn_cast(this)) return SC->getValue()->isMinusOne(); return false; } bool SCEV::isNonConstantNegative() const { const SCEVMulExpr *Mul = dyn_cast(this); if (!Mul) return false; // If there is a constant factor, it will be first. const SCEVConstant *SC = dyn_cast(Mul->getOperand(0)); if (!SC) return false; // Return true if the value is negative, this matches things like (-42 * V). 
return SC->getAPInt().isNegative(); } SCEVCouldNotCompute::SCEVCouldNotCompute() : SCEV(FoldingSetNodeIDRef(), scCouldNotCompute, 0) {} bool SCEVCouldNotCompute::classof(const SCEV *S) { return S->getSCEVType() == scCouldNotCompute; } const SCEV *ScalarEvolution::getConstant(ConstantInt *V) { FoldingSetNodeID ID; ID.AddInteger(scConstant); ID.AddPointer(V); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; SCEV *S = new (SCEVAllocator) SCEVConstant(ID.Intern(SCEVAllocator), V); UniqueSCEVs.InsertNode(S, IP); return S; } const SCEV *ScalarEvolution::getConstant(const APInt &Val) { return getConstant(ConstantInt::get(getContext(), Val)); } const SCEV * ScalarEvolution::getConstant(Type *Ty, uint64_t V, bool isSigned) { IntegerType *ITy = cast(getEffectiveSCEVType(Ty)); return getConstant(ConstantInt::get(ITy, V, isSigned)); } const SCEV *ScalarEvolution::getVScale(Type *Ty) { FoldingSetNodeID ID; ID.AddInteger(scVScale); ID.AddPointer(Ty); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; SCEV *S = new (SCEVAllocator) SCEVVScale(ID.Intern(SCEVAllocator), Ty); UniqueSCEVs.InsertNode(S, IP); return S; } SCEVCastExpr::SCEVCastExpr(const FoldingSetNodeIDRef ID, SCEVTypes SCEVTy, const SCEV *op, Type *ty) : SCEV(ID, SCEVTy, computeExpressionSize(op)), Op(op), Ty(ty) {} SCEVPtrToIntExpr::SCEVPtrToIntExpr(const FoldingSetNodeIDRef ID, const SCEV *Op, Type *ITy) : SCEVCastExpr(ID, scPtrToInt, Op, ITy) { assert(getOperand()->getType()->isPointerTy() && Ty->isIntegerTy() && "Must be a non-bit-width-changing pointer-to-integer cast!"); } SCEVIntegralCastExpr::SCEVIntegralCastExpr(const FoldingSetNodeIDRef ID, SCEVTypes SCEVTy, const SCEV *op, Type *ty) : SCEVCastExpr(ID, SCEVTy, op, ty) {} SCEVTruncateExpr::SCEVTruncateExpr(const FoldingSetNodeIDRef ID, const SCEV *op, Type *ty) : SCEVIntegralCastExpr(ID, scTruncate, op, ty) { assert(getOperand()->getType()->isIntOrPtrTy() && Ty->isIntOrPtrTy() && "Cannot truncate non-integer value!"); } SCEVZeroExtendExpr::SCEVZeroExtendExpr(const FoldingSetNodeIDRef ID, const SCEV *op, Type *ty) : SCEVIntegralCastExpr(ID, scZeroExtend, op, ty) { assert(getOperand()->getType()->isIntOrPtrTy() && Ty->isIntOrPtrTy() && "Cannot zero extend non-integer value!"); } SCEVSignExtendExpr::SCEVSignExtendExpr(const FoldingSetNodeIDRef ID, const SCEV *op, Type *ty) : SCEVIntegralCastExpr(ID, scSignExtend, op, ty) { assert(getOperand()->getType()->isIntOrPtrTy() && Ty->isIntOrPtrTy() && "Cannot sign extend non-integer value!"); } void SCEVUnknown::deleted() { // Clear this SCEVUnknown from various maps. SE->forgetMemoizedResults(this); // Remove this SCEVUnknown from the uniquing map. SE->UniqueSCEVs.RemoveNode(this); // Release the value. setValPtr(nullptr); } void SCEVUnknown::allUsesReplacedWith(Value *New) { // Clear this SCEVUnknown from various maps. SE->forgetMemoizedResults(this); // Remove this SCEVUnknown from the uniquing map. SE->UniqueSCEVs.RemoveNode(this); // Replace the value pointer in case someone is still using this SCEVUnknown. setValPtr(New); } //===----------------------------------------------------------------------===// // SCEV Utilities //===----------------------------------------------------------------------===// /// Compare the two values \p LV and \p RV in terms of their "complexity" where /// "complexity" is a partial (and somewhat ad-hoc) relation used to order /// operands in SCEV expressions. 
\p EqCache is a set of pairs of values that /// have been previously deemed to be "equally complex" by this routine. It is /// intended to avoid exponential time complexity in cases like: /// /// %a = f(%x, %y) /// %b = f(%a, %a) /// %c = f(%b, %b) /// /// %d = f(%x, %y) /// %e = f(%d, %d) /// %f = f(%e, %e) /// /// CompareValueComplexity(%f, %c) /// /// Since we do not continue running this routine on expression trees once we /// have seen unequal values, there is no need to track them in the cache. static int CompareValueComplexity(EquivalenceClasses &EqCacheValue, const LoopInfo *const LI, Value *LV, Value *RV, unsigned Depth) { if (Depth > MaxValueCompareDepth || EqCacheValue.isEquivalent(LV, RV)) return 0; // Order pointer values after integer values. This helps SCEVExpander form // GEPs. bool LIsPointer = LV->getType()->isPointerTy(), RIsPointer = RV->getType()->isPointerTy(); if (LIsPointer != RIsPointer) return (int)LIsPointer - (int)RIsPointer; // Compare getValueID values. unsigned LID = LV->getValueID(), RID = RV->getValueID(); if (LID != RID) return (int)LID - (int)RID; // Sort arguments by their position. if (const auto *LA = dyn_cast(LV)) { const auto *RA = cast(RV); unsigned LArgNo = LA->getArgNo(), RArgNo = RA->getArgNo(); return (int)LArgNo - (int)RArgNo; } if (const auto *LGV = dyn_cast(LV)) { const auto *RGV = cast(RV); const auto IsGVNameSemantic = [&](const GlobalValue *GV) { auto LT = GV->getLinkage(); return !(GlobalValue::isPrivateLinkage(LT) || GlobalValue::isInternalLinkage(LT)); }; // Use the names to distinguish the two values, but only if the // names are semantically important. if (IsGVNameSemantic(LGV) && IsGVNameSemantic(RGV)) return LGV->getName().compare(RGV->getName()); } // For instructions, compare their loop depth, and their operand count. This // is pretty loose. if (const auto *LInst = dyn_cast(LV)) { const auto *RInst = cast(RV); // Compare loop depths. const BasicBlock *LParent = LInst->getParent(), *RParent = RInst->getParent(); if (LParent != RParent) { unsigned LDepth = LI->getLoopDepth(LParent), RDepth = LI->getLoopDepth(RParent); if (LDepth != RDepth) return (int)LDepth - (int)RDepth; } // Compare the number of operands. unsigned LNumOps = LInst->getNumOperands(), RNumOps = RInst->getNumOperands(); if (LNumOps != RNumOps) return (int)LNumOps - (int)RNumOps; for (unsigned Idx : seq(0u, LNumOps)) { int Result = CompareValueComplexity(EqCacheValue, LI, LInst->getOperand(Idx), RInst->getOperand(Idx), Depth + 1); if (Result != 0) return Result; } } EqCacheValue.unionSets(LV, RV); return 0; } // Return negative, zero, or positive, if LHS is less than, equal to, or greater // than RHS, respectively. A three-way result allows recursive comparisons to be // more efficient. // If the max analysis depth was reached, return std::nullopt, assuming we do // not know if they are equivalent for sure. static std::optional CompareSCEVComplexity(EquivalenceClasses &EqCacheSCEV, EquivalenceClasses &EqCacheValue, const LoopInfo *const LI, const SCEV *LHS, const SCEV *RHS, DominatorTree &DT, unsigned Depth = 0) { // Fast-path: SCEVs are uniqued so we can do a quick equality check. if (LHS == RHS) return 0; // Primarily, sort the SCEVs by their getSCEVType(). 
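// Both comparison routines memoize pairs that have been found "equally
// complex" in an EquivalenceClasses set so that shared subtrees are compared
// only once.  The relevant slice of that API (llvm/ADT/EquivalenceClasses.h),
// shown here as a reduced sketch:
//
//   EquivalenceClasses<const Value *> EqCache;
//   EqCache.unionSets(A, B);          // record "A is as complex as B"
//   if (EqCache.isEquivalent(A, B))   // later lookups short-circuit to 0
//     ...
//
// Without this cache, the %a/%b/%c example in the comment above would be
// revisited once per path through the shared operands, i.e. exponentially
// often in the depth of the expression tree.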
SCEVTypes LType = LHS->getSCEVType(), RType = RHS->getSCEVType(); if (LType != RType) return (int)LType - (int)RType; if (EqCacheSCEV.isEquivalent(LHS, RHS)) return 0; if (Depth > MaxSCEVCompareDepth) return std::nullopt; // Aside from the getSCEVType() ordering, the particular ordering // isn't very important except that it's beneficial to be consistent, // so that (a + b) and (b + a) don't end up as different expressions. switch (LType) { case scUnknown: { const SCEVUnknown *LU = cast(LHS); const SCEVUnknown *RU = cast(RHS); int X = CompareValueComplexity(EqCacheValue, LI, LU->getValue(), RU->getValue(), Depth + 1); if (X == 0) EqCacheSCEV.unionSets(LHS, RHS); return X; } case scConstant: { const SCEVConstant *LC = cast(LHS); const SCEVConstant *RC = cast(RHS); // Compare constant values. const APInt &LA = LC->getAPInt(); const APInt &RA = RC->getAPInt(); unsigned LBitWidth = LA.getBitWidth(), RBitWidth = RA.getBitWidth(); if (LBitWidth != RBitWidth) return (int)LBitWidth - (int)RBitWidth; return LA.ult(RA) ? -1 : 1; } case scVScale: { const auto *LTy = cast(cast(LHS)->getType()); const auto *RTy = cast(cast(RHS)->getType()); return LTy->getBitWidth() - RTy->getBitWidth(); } case scAddRecExpr: { const SCEVAddRecExpr *LA = cast(LHS); const SCEVAddRecExpr *RA = cast(RHS); // There is always a dominance between two recs that are used by one SCEV, // so we can safely sort recs by loop header dominance. We require such // order in getAddExpr. const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop(); if (LLoop != RLoop) { const BasicBlock *LHead = LLoop->getHeader(), *RHead = RLoop->getHeader(); assert(LHead != RHead && "Two loops share the same header?"); if (DT.dominates(LHead, RHead)) return 1; assert(DT.dominates(RHead, LHead) && "No dominance between recurrences used by one SCEV?"); return -1; } [[fallthrough]]; } case scTruncate: case scZeroExtend: case scSignExtend: case scPtrToInt: case scAddExpr: case scMulExpr: case scUDivExpr: case scSMaxExpr: case scUMaxExpr: case scSMinExpr: case scUMinExpr: case scSequentialUMinExpr: { ArrayRef LOps = LHS->operands(); ArrayRef ROps = RHS->operands(); // Lexicographically compare n-ary-like expressions. unsigned LNumOps = LOps.size(), RNumOps = ROps.size(); if (LNumOps != RNumOps) return (int)LNumOps - (int)RNumOps; for (unsigned i = 0; i != LNumOps; ++i) { auto X = CompareSCEVComplexity(EqCacheSCEV, EqCacheValue, LI, LOps[i], ROps[i], DT, Depth + 1); if (X != 0) return X; } EqCacheSCEV.unionSets(LHS, RHS); return 0; } case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); } llvm_unreachable("Unknown SCEV kind!"); } /// Given a list of SCEV objects, order them by their complexity, and group /// objects of the same complexity together by value. When this routine is /// finished, we know that any duplicates in the vector are consecutive and that /// complexity is monotonically increasing. /// /// Note that we go take special precautions to ensure that we get deterministic /// results from this routine. In other words, we don't want the results of /// this to depend on where the addresses of various SCEV objects happened to /// land in memory. static void GroupByComplexity(SmallVectorImpl &Ops, LoopInfo *LI, DominatorTree &DT) { if (Ops.size() < 2) return; // Noop EquivalenceClasses EqCacheSCEV; EquivalenceClasses EqCacheValue; // Whether LHS has provably less complexity than RHS. 
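// A small example of what the grouping below buys the callers (illustrative,
// not from the original comment): given the operand list of an add such as
//
//   ( %x, %y, %x )
//
// the sort/group step makes the two %x operands adjacent, so a folder like
// getAddExpr() can spot the repetition by looking only at neighbouring
// elements and combine it (x + x + y becomes 2*x + y) without any extra
// bookkeeping.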
auto IsLessComplex = [&](const SCEV *LHS, const SCEV *RHS) { auto Complexity = CompareSCEVComplexity(EqCacheSCEV, EqCacheValue, LI, LHS, RHS, DT); return Complexity && *Complexity < 0; }; if (Ops.size() == 2) { // This is the common case, which also happens to be trivially simple. // Special case it. const SCEV *&LHS = Ops[0], *&RHS = Ops[1]; if (IsLessComplex(RHS, LHS)) std::swap(LHS, RHS); return; } // Do the rough sort by complexity. llvm::stable_sort(Ops, [&](const SCEV *LHS, const SCEV *RHS) { return IsLessComplex(LHS, RHS); }); // Now that we are sorted by complexity, group elements of the same // complexity. Note that this is, at worst, N^2, but the vector is likely to // be extremely short in practice. Note that we take this approach because we // do not want to depend on the addresses of the objects we are grouping. for (unsigned i = 0, e = Ops.size(); i != e-2; ++i) { const SCEV *S = Ops[i]; unsigned Complexity = S->getSCEVType(); // If there are any objects of the same complexity and same value as this // one, group them. for (unsigned j = i+1; j != e && Ops[j]->getSCEVType() == Complexity; ++j) { if (Ops[j] == S) { // Found a duplicate. // Move it to immediately after i'th element. std::swap(Ops[i+1], Ops[j]); ++i; // no need to rescan it. if (i == e-2) return; // Done! } } } } /// Returns true if \p Ops contains a huge SCEV (the subtree of S contains at /// least HugeExprThreshold nodes). static bool hasHugeExpression(ArrayRef Ops) { return any_of(Ops, [](const SCEV *S) { return S->getExpressionSize() >= HugeExprThreshold; }); } //===----------------------------------------------------------------------===// // Simple SCEV method implementations //===----------------------------------------------------------------------===// /// Compute BC(It, K). The result has width W. Assume, K > 0. static const SCEV *BinomialCoefficient(const SCEV *It, unsigned K, ScalarEvolution &SE, Type *ResultTy) { // Handle the simplest case efficiently. if (K == 1) return SE.getTruncateOrZeroExtend(It, ResultTy); // We are using the following formula for BC(It, K): // // BC(It, K) = (It * (It - 1) * ... * (It - K + 1)) / K! // // Suppose, W is the bitwidth of the return value. We must be prepared for // overflow. Hence, we must assure that the result of our computation is // equal to the accurate one modulo 2^W. Unfortunately, division isn't // safe in modular arithmetic. // // However, this code doesn't use exactly that formula; the formula it uses // is something like the following, where T is the number of factors of 2 in // K! (i.e. trailing zeros in the binary representation of K!), and ^ is // exponentiation: // // BC(It, K) = (It * (It - 1) * ... * (It - K + 1)) / 2^T / (K! / 2^T) // // This formula is trivially equivalent to the previous formula. However, // this formula can be implemented much more efficiently. The trick is that // K! / 2^T is odd, and exact division by an odd number *is* safe in modular // arithmetic. To do exact division in modular arithmetic, all we have // to do is multiply by the inverse. Therefore, this step can be done at // width W. // // The next issue is how to safely do the division by 2^T. The way this // is done is by doing the multiplication step at a width of at least W + T // bits. This way, the bottom W+T bits of the product are accurate. Then, // when we perform the division by 2^T (which is equivalent to a right shift // by T), the bottom W bits are accurate. Extra bits are okay; they'll get // truncated out after the division by 2^T. 
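// A small worked instance of the scheme described above (added illustration):
// take K = 3 and a result width of W = 8.  Then K! = 6, T = 1 (one factor of
// two) and K!/2^T = 3, whose multiplicative inverse modulo 2^8 is 171
// (3 * 171 = 513 = 2*256 + 1).  Evaluating BC(It, 3) for It = 5:
//
//   product = 5 * 4 * 3          = 60    // done at W + T = 9 bits
//   shifted = 60 / 2^T           = 30    // exact division by 2
//   result  = (30 * 171) mod 256 = 10    // equals C(5,3) = 10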
// // In comparison to just directly using the first formula, this technique // is much more efficient; using the first formula requires W * K bits, // but this formula less than W + K bits. Also, the first formula requires // a division step, whereas this formula only requires multiplies and shifts. // // It doesn't matter whether the subtraction step is done in the calculation // width or the input iteration count's width; if the subtraction overflows, // the result must be zero anyway. We prefer here to do it in the width of // the induction variable because it helps a lot for certain cases; CodeGen // isn't smart enough to ignore the overflow, which leads to much less // efficient code if the width of the subtraction is wider than the native // register width. // // (It's possible to not widen at all by pulling out factors of 2 before // the multiplication; for example, K=2 can be calculated as // It/2*(It+(It*INT_MIN/INT_MIN)+-1). However, it requires // extra arithmetic, so it's not an obvious win, and it gets // much more complicated for K > 3.) // Protection from insane SCEVs; this bound is conservative, // but it probably doesn't matter. if (K > 1000) return SE.getCouldNotCompute(); unsigned W = SE.getTypeSizeInBits(ResultTy); // Calculate K! / 2^T and T; we divide out the factors of two before // multiplying for calculating K! / 2^T to avoid overflow. // Other overflow doesn't matter because we only care about the bottom // W bits of the result. APInt OddFactorial(W, 1); unsigned T = 1; for (unsigned i = 3; i <= K; ++i) { APInt Mult(W, i); unsigned TwoFactors = Mult.countr_zero(); T += TwoFactors; Mult.lshrInPlace(TwoFactors); OddFactorial *= Mult; } // We need at least W + T bits for the multiplication step unsigned CalculationBits = W + T; // Calculate 2^T, at width T+W. APInt DivFactor = APInt::getOneBitSet(CalculationBits, T); // Calculate the multiplicative inverse of K! / 2^T; // this multiplication factor will perform the exact division by // K! / 2^T. APInt Mod = APInt::getSignedMinValue(W+1); APInt MultiplyFactor = OddFactorial.zext(W+1); MultiplyFactor = MultiplyFactor.multiplicativeInverse(Mod); MultiplyFactor = MultiplyFactor.trunc(W); // Calculate the product, at width T+W IntegerType *CalculationTy = IntegerType::get(SE.getContext(), CalculationBits); const SCEV *Dividend = SE.getTruncateOrZeroExtend(It, CalculationTy); for (unsigned i = 1; i != K; ++i) { const SCEV *S = SE.getMinusSCEV(It, SE.getConstant(It->getType(), i)); Dividend = SE.getMulExpr(Dividend, SE.getTruncateOrZeroExtend(S, CalculationTy)); } // Divide by 2^T const SCEV *DivResult = SE.getUDivExpr(Dividend, SE.getConstant(DivFactor)); // Truncate the result, and divide by K! / 2^T. return SE.getMulExpr(SE.getConstant(MultiplyFactor), SE.getTruncateOrZeroExtend(DivResult, ResultTy)); } /// Return the value of this chain of recurrences at the specified iteration /// number. We can evaluate this recurrence by multiplying each element in the /// chain by the binomial coefficient corresponding to it. In other words, we /// can evaluate {A,+,B,+,C,+,D} as: /// /// A*BC(It, 0) + B*BC(It, 1) + C*BC(It, 2) + D*BC(It, 3) /// /// where BC(It, k) stands for binomial coefficient. 
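// Two quick numeric instances of the formula above (added illustration): for
// the affine recurrence {2,+,3} the value at iteration It is
// 2 + 3*BC(It,1) = 2 + 3*It, i.e. 2, 5, 8, 11, 14, ...; for the quadratic
// recurrence {0,+,1,+,1} it is 0 + 1*BC(It,1) + 1*BC(It,2) = It + It*(It-1)/2,
// i.e. 0, 1, 3, 6, 10, ...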
const SCEV *SCEVAddRecExpr::evaluateAtIteration(const SCEV *It, ScalarEvolution &SE) const { return evaluateAtIteration(operands(), It, SE); } const SCEV * SCEVAddRecExpr::evaluateAtIteration(ArrayRef Operands, const SCEV *It, ScalarEvolution &SE) { assert(Operands.size() > 0); const SCEV *Result = Operands[0]; for (unsigned i = 1, e = Operands.size(); i != e; ++i) { // The computation is correct in the face of overflow provided that the // multiplication is performed _after_ the evaluation of the binomial // coefficient. const SCEV *Coeff = BinomialCoefficient(It, i, SE, Result->getType()); if (isa(Coeff)) return Coeff; Result = SE.getAddExpr(Result, SE.getMulExpr(Operands[i], Coeff)); } return Result; } //===----------------------------------------------------------------------===// // SCEV Expression folder implementations //===----------------------------------------------------------------------===// const SCEV *ScalarEvolution::getLosslessPtrToIntExpr(const SCEV *Op, unsigned Depth) { assert(Depth <= 1 && "getLosslessPtrToIntExpr() should self-recurse at most once."); // We could be called with an integer-typed operands during SCEV rewrites. // Since the operand is an integer already, just perform zext/trunc/self cast. if (!Op->getType()->isPointerTy()) return Op; // What would be an ID for such a SCEV cast expression? FoldingSetNodeID ID; ID.AddInteger(scPtrToInt); ID.AddPointer(Op); void *IP = nullptr; // Is there already an expression for such a cast? if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; // It isn't legal for optimizations to construct new ptrtoint expressions // for non-integral pointers. if (getDataLayout().isNonIntegralPointerType(Op->getType())) return getCouldNotCompute(); Type *IntPtrTy = getDataLayout().getIntPtrType(Op->getType()); // We can only trivially model ptrtoint if SCEV's effective (integer) type // is sufficiently wide to represent all possible pointer values. // We could theoretically teach SCEV to truncate wider pointers, but // that isn't implemented for now. if (getDataLayout().getTypeSizeInBits(getEffectiveSCEVType(Op->getType())) != getDataLayout().getTypeSizeInBits(IntPtrTy)) return getCouldNotCompute(); // If not, is this expression something we can't reduce any further? if (auto *U = dyn_cast(Op)) { // Perform some basic constant folding. If the operand of the ptr2int cast // is a null pointer, don't create a ptr2int SCEV expression (that will be // left as-is), but produce a zero constant. // NOTE: We could handle a more general case, but lack motivational cases. if (isa(U->getValue())) return getZero(IntPtrTy); // Create an explicit cast node. // We can reuse the existing insert position since if we get here, // we won't have made any changes which would invalidate it. SCEV *S = new (SCEVAllocator) SCEVPtrToIntExpr(ID.Intern(SCEVAllocator), Op, IntPtrTy); UniqueSCEVs.InsertNode(S, IP); registerUser(S, Op); return S; } assert(Depth == 0 && "getLosslessPtrToIntExpr() should not self-recurse for " "non-SCEVUnknown's."); // Otherwise, we've got some expression that is more complex than just a // single SCEVUnknown. But we don't want to have a SCEVPtrToIntExpr of an // arbitrary expression, we want to have SCEVPtrToIntExpr of an SCEVUnknown // only, and the expressions must otherwise be integer-typed. // So sink the cast down to the SCEVUnknown's. 
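// For example (added illustration): given a pointer-typed SCEV such as
// (16 + %base), where %base is a pointer-typed SCEVUnknown, the rewriter
// below yields (16 + ptrtoint(%base)): the addition is now performed on
// integers and the only pointer-typed node left is the SCEVUnknown itself,
// which getLosslessPtrToIntExpr() can wrap directly.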
/// The SCEVPtrToIntSinkingRewriter takes a scalar evolution expression, /// which computes a pointer-typed value, and rewrites the whole expression /// tree so that *all* the computations are done on integers, and the only /// pointer-typed operands in the expression are SCEVUnknown. class SCEVPtrToIntSinkingRewriter : public SCEVRewriteVisitor { using Base = SCEVRewriteVisitor; public: SCEVPtrToIntSinkingRewriter(ScalarEvolution &SE) : SCEVRewriteVisitor(SE) {} static const SCEV *rewrite(const SCEV *Scev, ScalarEvolution &SE) { SCEVPtrToIntSinkingRewriter Rewriter(SE); return Rewriter.visit(Scev); } const SCEV *visit(const SCEV *S) { Type *STy = S->getType(); // If the expression is not pointer-typed, just keep it as-is. if (!STy->isPointerTy()) return S; // Else, recursively sink the cast down into it. return Base::visit(S); } const SCEV *visitAddExpr(const SCEVAddExpr *Expr) { SmallVector Operands; bool Changed = false; for (const auto *Op : Expr->operands()) { Operands.push_back(visit(Op)); Changed |= Op != Operands.back(); } return !Changed ? Expr : SE.getAddExpr(Operands, Expr->getNoWrapFlags()); } const SCEV *visitMulExpr(const SCEVMulExpr *Expr) { SmallVector Operands; bool Changed = false; for (const auto *Op : Expr->operands()) { Operands.push_back(visit(Op)); Changed |= Op != Operands.back(); } return !Changed ? Expr : SE.getMulExpr(Operands, Expr->getNoWrapFlags()); } const SCEV *visitUnknown(const SCEVUnknown *Expr) { assert(Expr->getType()->isPointerTy() && "Should only reach pointer-typed SCEVUnknown's."); return SE.getLosslessPtrToIntExpr(Expr, /*Depth=*/1); } }; // And actually perform the cast sinking. const SCEV *IntOp = SCEVPtrToIntSinkingRewriter::rewrite(Op, *this); assert(IntOp->getType()->isIntegerTy() && "We must have succeeded in sinking the cast, " "and ending up with an integer-typed expression!"); return IntOp; } const SCEV *ScalarEvolution::getPtrToIntExpr(const SCEV *Op, Type *Ty) { assert(Ty->isIntegerTy() && "Target type must be an integer type!"); const SCEV *IntOp = getLosslessPtrToIntExpr(Op); if (isa(IntOp)) return IntOp; return getTruncateOrZeroExtend(IntOp, Ty); } const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op, Type *Ty, unsigned Depth) { assert(getTypeSizeInBits(Op->getType()) > getTypeSizeInBits(Ty) && "This is not a truncating conversion!"); assert(isSCEVable(Ty) && "This is not a conversion to a SCEVable type!"); assert(!Op->getType()->isPointerTy() && "Can't truncate pointer!"); Ty = getEffectiveSCEVType(Ty); FoldingSetNodeID ID; ID.AddInteger(scTruncate); ID.AddPointer(Op); ID.AddPointer(Ty); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast(Op)) return getConstant( cast(ConstantExpr::getTrunc(SC->getValue(), Ty))); // trunc(trunc(x)) --> trunc(x) if (const SCEVTruncateExpr *ST = dyn_cast(Op)) return getTruncateExpr(ST->getOperand(), Ty, Depth + 1); // trunc(sext(x)) --> sext(x) if widening or trunc(x) if narrowing if (const SCEVSignExtendExpr *SS = dyn_cast(Op)) return getTruncateOrSignExtend(SS->getOperand(), Ty, Depth + 1); // trunc(zext(x)) --> zext(x) if widening or trunc(x) if narrowing if (const SCEVZeroExtendExpr *SZ = dyn_cast(Op)) return getTruncateOrZeroExtend(SZ->getOperand(), Ty, Depth + 1); if (Depth > MaxCastDepth) { SCEV *S = new (SCEVAllocator) SCEVTruncateExpr(ID.Intern(SCEVAllocator), Op, Ty); UniqueSCEVs.InsertNode(S, IP); registerUser(S, Op); return S; } // trunc(x1 + ... + xN) --> trunc(x1) + ... 
+ trunc(xN) and // trunc(x1 * ... * xN) --> trunc(x1) * ... * trunc(xN), // if after transforming we have at most one truncate, not counting truncates // that replace other casts. if (isa(Op) || isa(Op)) { auto *CommOp = cast(Op); SmallVector Operands; unsigned numTruncs = 0; for (unsigned i = 0, e = CommOp->getNumOperands(); i != e && numTruncs < 2; ++i) { const SCEV *S = getTruncateExpr(CommOp->getOperand(i), Ty, Depth + 1); if (!isa(CommOp->getOperand(i)) && isa(S)) numTruncs++; Operands.push_back(S); } if (numTruncs < 2) { if (isa(Op)) return getAddExpr(Operands); if (isa(Op)) return getMulExpr(Operands); llvm_unreachable("Unexpected SCEV type for Op."); } // Although we checked in the beginning that ID is not in the cache, it is // possible that during recursion and different modification ID was inserted // into the cache. So if we find it, just return it. if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; } // If the input value is a chrec scev, truncate the chrec's operands. if (const SCEVAddRecExpr *AddRec = dyn_cast(Op)) { SmallVector Operands; for (const SCEV *Op : AddRec->operands()) Operands.push_back(getTruncateExpr(Op, Ty, Depth + 1)); return getAddRecExpr(Operands, AddRec->getLoop(), SCEV::FlagAnyWrap); } // Return zero if truncating to known zeros. uint32_t MinTrailingZeros = getMinTrailingZeros(Op); if (MinTrailingZeros >= getTypeSizeInBits(Ty)) return getZero(Ty); // The cast wasn't folded; create an explicit cast node. We can reuse // the existing insert position since if we get here, we won't have // made any changes which would invalidate it. SCEV *S = new (SCEVAllocator) SCEVTruncateExpr(ID.Intern(SCEVAllocator), Op, Ty); UniqueSCEVs.InsertNode(S, IP); registerUser(S, Op); return S; } // Get the limit of a recurrence such that incrementing by Step cannot cause // signed overflow as long as the value of the recurrence within the // loop does not exceed this limit before incrementing. static const SCEV *getSignedOverflowLimitForStep(const SCEV *Step, ICmpInst::Predicate *Pred, ScalarEvolution *SE) { unsigned BitWidth = SE->getTypeSizeInBits(Step->getType()); if (SE->isKnownPositive(Step)) { *Pred = ICmpInst::ICMP_SLT; return SE->getConstant(APInt::getSignedMinValue(BitWidth) - SE->getSignedRangeMax(Step)); } if (SE->isKnownNegative(Step)) { *Pred = ICmpInst::ICMP_SGT; return SE->getConstant(APInt::getSignedMaxValue(BitWidth) - SE->getSignedRangeMin(Step)); } return nullptr; } // Get the limit of a recurrence such that incrementing by Step cannot cause // unsigned overflow as long as the value of the recurrence within the loop does // not exceed this limit before incrementing. static const SCEV *getUnsignedOverflowLimitForStep(const SCEV *Step, ICmpInst::Predicate *Pred, ScalarEvolution *SE) { unsigned BitWidth = SE->getTypeSizeInBits(Step->getType()); *Pred = ICmpInst::ICMP_ULT; return SE->getConstant(APInt::getMinValue(BitWidth) - SE->getUnsignedRangeMax(Step)); } namespace { struct ExtendOpTraitsBase { typedef const SCEV *(ScalarEvolution::*GetExtendExprTy)(const SCEV *, Type *, unsigned); }; // Used to make code generic over signed and unsigned overflow. 
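// Reduced to a minimal, hypothetical example (names below are not from this
// file), the pattern is an empty primary template whose explicit
// specializations supply the per-flavour pieces, so a single templated
// algorithm can be instantiated for both the signed and the unsigned case:
//
//   struct Signed {};  struct Unsigned {};
//   template <typename Op> struct Traits;                 // primary template
//   template <> struct Traits<Signed>   { static constexpr int WrapFlag = 1; };
//   template <> struct Traits<Unsigned> { static constexpr int WrapFlag = 2; };
//
//   template <typename Op> int getWrapFlag() { return Traits<Op>::WrapFlag; }
//
// ExtendOpTraits below plays the same role for SCEVSignExtendExpr /
// SCEVZeroExtendExpr, additionally storing a pointer to the matching
// ScalarEvolution::getSignExtendExpr / getZeroExtendExpr member function.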
template struct ExtendOpTraits { // Members present: // // static const SCEV::NoWrapFlags WrapType; // // static const ExtendOpTraitsBase::GetExtendExprTy GetExtendExpr; // // static const SCEV *getOverflowLimitForStep(const SCEV *Step, // ICmpInst::Predicate *Pred, // ScalarEvolution *SE); }; template <> struct ExtendOpTraits : public ExtendOpTraitsBase { static const SCEV::NoWrapFlags WrapType = SCEV::FlagNSW; static const GetExtendExprTy GetExtendExpr; static const SCEV *getOverflowLimitForStep(const SCEV *Step, ICmpInst::Predicate *Pred, ScalarEvolution *SE) { return getSignedOverflowLimitForStep(Step, Pred, SE); } }; const ExtendOpTraitsBase::GetExtendExprTy ExtendOpTraits< SCEVSignExtendExpr>::GetExtendExpr = &ScalarEvolution::getSignExtendExpr; template <> struct ExtendOpTraits : public ExtendOpTraitsBase { static const SCEV::NoWrapFlags WrapType = SCEV::FlagNUW; static const GetExtendExprTy GetExtendExpr; static const SCEV *getOverflowLimitForStep(const SCEV *Step, ICmpInst::Predicate *Pred, ScalarEvolution *SE) { return getUnsignedOverflowLimitForStep(Step, Pred, SE); } }; const ExtendOpTraitsBase::GetExtendExprTy ExtendOpTraits< SCEVZeroExtendExpr>::GetExtendExpr = &ScalarEvolution::getZeroExtendExpr; } // end anonymous namespace // The recurrence AR has been shown to have no signed/unsigned wrap or something // close to it. Typically, if we can prove NSW/NUW for AR, then we can just as // easily prove NSW/NUW for its preincrement or postincrement sibling. This // allows normalizing a sign/zero extended AddRec as such: {sext/zext(Step + // Start),+,Step} => {(Step + sext/zext(Start),+,Step} As a result, the // expression "Step + sext/zext(PreIncAR)" is congruent with // "sext/zext(PostIncAR)" template static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty, ScalarEvolution *SE, unsigned Depth) { auto WrapType = ExtendOpTraits::WrapType; auto GetExtendExpr = ExtendOpTraits::GetExtendExpr; const Loop *L = AR->getLoop(); const SCEV *Start = AR->getStart(); const SCEV *Step = AR->getStepRecurrence(*SE); // Check for a simple looking step prior to loop entry. const SCEVAddExpr *SA = dyn_cast(Start); if (!SA) return nullptr; // Create an AddExpr for "PreStart" after subtracting Step. Full SCEV // subtraction is expensive. For this purpose, perform a quick and dirty // difference, by checking for Step in the operand list. SmallVector DiffOps; for (const SCEV *Op : SA->operands()) if (Op != Step) DiffOps.push_back(Op); if (DiffOps.size() == SA->getNumOperands()) return nullptr; // Try to prove `WrapType` (SCEV::FlagNSW or SCEV::FlagNUW) on `PreStart` + // `Step`: // 1. NSW/NUW flags on the step increment. auto PreStartFlags = ScalarEvolution::maskFlags(SA->getNoWrapFlags(), SCEV::FlagNUW); const SCEV *PreStart = SE->getAddExpr(DiffOps, PreStartFlags); const SCEVAddRecExpr *PreAR = dyn_cast( SE->getAddRecExpr(PreStart, Step, L, SCEV::FlagAnyWrap)); // "{S,+,X} is /" and "the backedge is taken at least once" implies // "S+X does not sign/unsign-overflow". // const SCEV *BECount = SE->getBackedgeTakenCount(L); if (PreAR && PreAR->getNoWrapFlags(WrapType) && !isa(BECount) && SE->isKnownPositive(BECount)) return PreStart; // 2. Direct overflow check on the step operation's expression. 
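// The "direct overflow check" works by redoing the start addition in a type
// twice as wide: if ext(PreStart) + ext(Step), computed at 2*BitWidth bits,
// folds to the same SCEV as ext(Start), then PreStart + Step cannot have
// wrapped at the original width (in the signed or unsigned sense matching the
// extension) -- a wrap would have produced a different value once the extra
// bits are available.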
unsigned BitWidth = SE->getTypeSizeInBits(AR->getType()); Type *WideTy = IntegerType::get(SE->getContext(), BitWidth * 2); const SCEV *OperandExtendedStart = SE->getAddExpr((SE->*GetExtendExpr)(PreStart, WideTy, Depth), (SE->*GetExtendExpr)(Step, WideTy, Depth)); if ((SE->*GetExtendExpr)(Start, WideTy, Depth) == OperandExtendedStart) { if (PreAR && AR->getNoWrapFlags(WrapType)) { // If we know `AR` == {`PreStart`+`Step`,+,`Step`} is `WrapType` (FlagNSW // or FlagNUW) and that `PreStart` + `Step` is `WrapType` too, then // `PreAR` == {`PreStart`,+,`Step`} is also `WrapType`. Cache this fact. SE->setNoWrapFlags(const_cast(PreAR), WrapType); } return PreStart; } // 3. Loop precondition. ICmpInst::Predicate Pred; const SCEV *OverflowLimit = ExtendOpTraits::getOverflowLimitForStep(Step, &Pred, SE); if (OverflowLimit && SE->isLoopEntryGuardedByCond(L, Pred, PreStart, OverflowLimit)) return PreStart; return nullptr; } // Get the normalized zero or sign extended expression for this AddRec's Start. template static const SCEV *getExtendAddRecStart(const SCEVAddRecExpr *AR, Type *Ty, ScalarEvolution *SE, unsigned Depth) { auto GetExtendExpr = ExtendOpTraits::GetExtendExpr; const SCEV *PreStart = getPreStartForExtend(AR, Ty, SE, Depth); if (!PreStart) return (SE->*GetExtendExpr)(AR->getStart(), Ty, Depth); return SE->getAddExpr((SE->*GetExtendExpr)(AR->getStepRecurrence(*SE), Ty, Depth), (SE->*GetExtendExpr)(PreStart, Ty, Depth)); } // Try to prove away overflow by looking at "nearby" add recurrences. A // motivating example for this rule: if we know `{0,+,4}` is `ult` `-1` and it // does not itself wrap then we can conclude that `{1,+,4}` is `nuw`. // // Formally: // // {S,+,X} == {S-T,+,X} + T // => Ext({S,+,X}) == Ext({S-T,+,X} + T) // // If ({S-T,+,X} + T) does not overflow ... (1) // // RHS == Ext({S-T,+,X} + T) == Ext({S-T,+,X}) + Ext(T) // // If {S-T,+,X} does not overflow ... (2) // // RHS == Ext({S-T,+,X}) + Ext(T) == {Ext(S-T),+,Ext(X)} + Ext(T) // == {Ext(S-T)+Ext(T),+,Ext(X)} // // If (S-T)+T does not overflow ... (3) // // RHS == {Ext(S-T)+Ext(T),+,Ext(X)} == {Ext(S-T+T),+,Ext(X)} // == {Ext(S),+,Ext(X)} == LHS // // Thus, if (1), (2) and (3) are true for some T, then // Ext({S,+,X}) == {Ext(S),+,Ext(X)} // // (3) is implied by (1) -- "(S-T)+T does not overflow" is simply "({S-T,+,X}+T) // does not overflow" restricted to the 0th iteration. Therefore we only need // to check for (1) and (2). // // In the current context, S is `Start`, X is `Step`, Ext is `ExtendOpTy` and T // is `Delta` (defined below). template bool ScalarEvolution::proveNoWrapByVaryingStart(const SCEV *Start, const SCEV *Step, const Loop *L) { auto WrapType = ExtendOpTraits::WrapType; // We restrict `Start` to a constant to prevent SCEV from spending too much // time here. It is correct (but more expensive) to continue with a // non-constant `Start` and do a general SCEV subtraction to compute // `PreStart` below. const SCEVConstant *StartC = dyn_cast(Start); if (!StartC) return false; APInt StartAI = StartC->getAPInt(); for (unsigned Delta : {-2, -1, 1, 2}) { const SCEV *PreStart = getConstant(StartAI - Delta); FoldingSetNodeID ID; ID.AddInteger(scAddRecExpr); ID.AddPointer(PreStart); ID.AddPointer(Step); ID.AddPointer(L); void *IP = nullptr; const auto *PreAR = static_cast(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); // Give up if we don't already have the add recurrence we need because // actually constructing an add recurrence is relatively expensive. 
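// Reading of the proof obligations checked below: Delta plays the role of T
// from the comment above, so the cached PreAR is {S-T,+,X}. An existing
// no-wrap flag on PreAR discharges (2); the isKnownPredicate check against
// getOverflowLimitForStep discharges (1).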
if (PreAR && PreAR->getNoWrapFlags(WrapType)) { // proves (2) const SCEV *DeltaS = getConstant(StartC->getType(), Delta); ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; const SCEV *Limit = ExtendOpTraits::getOverflowLimitForStep( DeltaS, &Pred, this); if (Limit && isKnownPredicate(Pred, PreAR, Limit)) // proves (1) return true; } } return false; } // Finds an integer D for an expression (C + x + y + ...) such that the top // level addition in (D + (C - D + x + y + ...)) would not wrap (signed or // unsigned) and the number of trailing zeros of (C - D + x + y + ...) is // maximized, where C is the \p ConstantTerm, x, y, ... are arbitrary SCEVs, and // the (C + x + y + ...) expression is \p WholeAddExpr. static APInt extractConstantWithoutWrapping(ScalarEvolution &SE, const SCEVConstant *ConstantTerm, const SCEVAddExpr *WholeAddExpr) { const APInt &C = ConstantTerm->getAPInt(); const unsigned BitWidth = C.getBitWidth(); // Find number of trailing zeros of (x + y + ...) w/o the C first: uint32_t TZ = BitWidth; for (unsigned I = 1, E = WholeAddExpr->getNumOperands(); I < E && TZ; ++I) TZ = std::min(TZ, SE.getMinTrailingZeros(WholeAddExpr->getOperand(I))); if (TZ) { // Set D to be as many least significant bits of C as possible while still // guaranteeing that adding D to (C - D + x + y + ...) won't cause a wrap: return TZ < BitWidth ? C.trunc(TZ).zext(BitWidth) : C; } return APInt(BitWidth, 0); } // Finds an integer D for an affine AddRec expression {C,+,x} such that the top // level addition in (D + {C-D,+,x}) would not wrap (signed or unsigned) and the // number of trailing zeros of (C - D + x * n) is maximized, where C is the \p // ConstantStart, x is an arbitrary \p Step, and n is the loop trip count. static APInt extractConstantWithoutWrapping(ScalarEvolution &SE, const APInt &ConstantStart, const SCEV *Step) { const unsigned BitWidth = ConstantStart.getBitWidth(); const uint32_t TZ = SE.getMinTrailingZeros(Step); if (TZ) return TZ < BitWidth ? ConstantStart.trunc(TZ).zext(BitWidth) : ConstantStart; return APInt(BitWidth, 0); } static void insertFoldCacheEntry( const ScalarEvolution::FoldID &ID, const SCEV *S, DenseMap &FoldCache, DenseMap> &FoldCacheUser) { auto I = FoldCache.insert({ID, S}); if (!I.second) { // Remove FoldCacheUser entry for ID when replacing an existing FoldCache // entry. 
auto &UserIDs = FoldCacheUser[I.first->second]; assert(count(UserIDs, ID) == 1 && "unexpected duplicates in UserIDs"); for (unsigned I = 0; I != UserIDs.size(); ++I) if (UserIDs[I] == ID) { std::swap(UserIDs[I], UserIDs.back()); break; } UserIDs.pop_back(); I.first->second = S; } auto R = FoldCacheUser.insert({S, {}}); R.first->second.push_back(ID); } const SCEV * ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) && "This is not an extending conversion!"); assert(isSCEVable(Ty) && "This is not a conversion to a SCEVable type!"); assert(!Op->getType()->isPointerTy() && "Can't extend pointer!"); Ty = getEffectiveSCEVType(Ty); FoldID ID(scZeroExtend, Op, Ty); auto Iter = FoldCache.find(ID); if (Iter != FoldCache.end()) return Iter->second; const SCEV *S = getZeroExtendExprImpl(Op, Ty, Depth); if (!isa(S)) insertFoldCacheEntry(ID, S, FoldCache, FoldCacheUser); return S; } const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, unsigned Depth) { assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) && "This is not an extending conversion!"); assert(isSCEVable(Ty) && "This is not a conversion to a SCEVable type!"); assert(!Op->getType()->isPointerTy() && "Can't extend pointer!"); // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast(Op)) return getConstant( cast(ConstantExpr::getZExt(SC->getValue(), Ty))); // zext(zext(x)) --> zext(x) if (const SCEVZeroExtendExpr *SZ = dyn_cast(Op)) return getZeroExtendExpr(SZ->getOperand(), Ty, Depth + 1); // Before doing any expensive analysis, check to see if we've already // computed a SCEV for this Op and Ty. FoldingSetNodeID ID; ID.AddInteger(scZeroExtend); ID.AddPointer(Op); ID.AddPointer(Ty); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; if (Depth > MaxCastDepth) { SCEV *S = new (SCEVAllocator) SCEVZeroExtendExpr(ID.Intern(SCEVAllocator), Op, Ty); UniqueSCEVs.InsertNode(S, IP); registerUser(S, Op); return S; } // zext(trunc(x)) --> zext(x) or x or trunc(x) if (const SCEVTruncateExpr *ST = dyn_cast(Op)) { // It's possible the bits taken off by the truncate were all zero bits. If // so, we should be able to simplify this further. const SCEV *X = ST->getOperand(); ConstantRange CR = getUnsignedRange(X); unsigned TruncBits = getTypeSizeInBits(ST->getType()); unsigned NewBits = getTypeSizeInBits(Ty); if (CR.truncate(TruncBits).zeroExtend(NewBits).contains( CR.zextOrTrunc(NewBits))) return getTruncateOrZeroExtend(X, Ty, Depth); } // If the input value is a chrec scev, and we can prove that the value // did not overflow the old, smaller, value, we can zero extend all of the // operands (often constants). This allows analysis of something like // this: for (unsigned char X = 0; X < 100; ++X) { int Y = X; } if (const SCEVAddRecExpr *AR = dyn_cast(Op)) if (AR->isAffine()) { const SCEV *Start = AR->getStart(); const SCEV *Step = AR->getStepRecurrence(*this); unsigned BitWidth = getTypeSizeInBits(AR->getType()); const Loop *L = AR->getLoop(); // If we have special knowledge that this addrec won't overflow, // we don't need to do any further analysis. if (AR->hasNoUnsignedWrap()) { Start = getExtendAddRecStart(AR, Ty, this, Depth + 1); Step = getZeroExtendExpr(Step, Ty, Depth + 1); return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } // Check whether the backedge-taken count is SCEVCouldNotCompute. 
// Note that this serves two purposes: It filters out loops that are // simply not analyzable, and it covers the case where this code is // being called from within backedge-taken count analysis, such that // attempting to ask for the backedge-taken count would likely result // in infinite recursion. In the later case, the analysis code will // cope with a conservative value, and it will take care to purge // that value once it has finished. const SCEV *MaxBECount = getConstantMaxBackedgeTakenCount(L); if (!isa(MaxBECount)) { // Manually compute the final value for AR, checking for overflow. // Check whether the backedge-taken count can be losslessly casted to // the addrec's type. The count is always unsigned. const SCEV *CastedMaxBECount = getTruncateOrZeroExtend(MaxBECount, Start->getType(), Depth); const SCEV *RecastedMaxBECount = getTruncateOrZeroExtend( CastedMaxBECount, MaxBECount->getType(), Depth); if (MaxBECount == RecastedMaxBECount) { Type *WideTy = IntegerType::get(getContext(), BitWidth * 2); // Check whether Start+Step*MaxBECount has no unsigned overflow. const SCEV *ZMul = getMulExpr(CastedMaxBECount, Step, SCEV::FlagAnyWrap, Depth + 1); const SCEV *ZAdd = getZeroExtendExpr(getAddExpr(Start, ZMul, SCEV::FlagAnyWrap, Depth + 1), WideTy, Depth + 1); const SCEV *WideStart = getZeroExtendExpr(Start, WideTy, Depth + 1); const SCEV *WideMaxBECount = getZeroExtendExpr(CastedMaxBECount, WideTy, Depth + 1); const SCEV *OperandExtendedAdd = getAddExpr(WideStart, getMulExpr(WideMaxBECount, getZeroExtendExpr(Step, WideTy, Depth + 1), SCEV::FlagAnyWrap, Depth + 1), SCEV::FlagAnyWrap, Depth + 1); if (ZAdd == OperandExtendedAdd) { // Cache knowledge of AR NUW, which is propagated to this AddRec. setNoWrapFlags(const_cast(AR), SCEV::FlagNUW); // Return the expression with the addrec on the outside. Start = getExtendAddRecStart(AR, Ty, this, Depth + 1); Step = getZeroExtendExpr(Step, Ty, Depth + 1); return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } // Similar to above, only this time treat the step value as signed. // This covers loops that count down. OperandExtendedAdd = getAddExpr(WideStart, getMulExpr(WideMaxBECount, getSignExtendExpr(Step, WideTy, Depth + 1), SCEV::FlagAnyWrap, Depth + 1), SCEV::FlagAnyWrap, Depth + 1); if (ZAdd == OperandExtendedAdd) { // Cache knowledge of AR NW, which is propagated to this AddRec. // Negative step causes unsigned wrap, but it still can't self-wrap. setNoWrapFlags(const_cast(AR), SCEV::FlagNW); // Return the expression with the addrec on the outside. Start = getExtendAddRecStart(AR, Ty, this, Depth + 1); Step = getSignExtendExpr(Step, Ty, Depth + 1); return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } } } // Normally, in the cases we can prove no-overflow via a // backedge guarding condition, we can also compute a backedge // taken count for the loop. The exceptions are assumptions and // guards present in the loop -- SCEV is not great at exploiting // these to compute max backedge taken counts, but can still use // these to prove lack of overflow. Use this fact to avoid // doing extra work that may not pay off. if (!isa(MaxBECount) || HasGuards || !AC.assumptions().empty()) { auto NewFlags = proveNoUnsignedWrapViaInduction(AR); setNoWrapFlags(const_cast(AR), NewFlags); if (AR->hasNoUnsignedWrap()) { // Same as nuw case above - duplicated here to avoid a compile time // issue. It's not clear that the order of checks does matter, but // it's one of two issue possible causes for a change which was // reverted. 
Be conservative for the moment. Start = getExtendAddRecStart(AR, Ty, this, Depth + 1); Step = getZeroExtendExpr(Step, Ty, Depth + 1); return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } // For a negative step, we can extend the operands iff doing so only // traverses values in the range zext([0,UINT_MAX]). if (isKnownNegative(Step)) { const SCEV *N = getConstant(APInt::getMaxValue(BitWidth) - getSignedRangeMin(Step)); if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_UGT, AR, N) || isKnownOnEveryIteration(ICmpInst::ICMP_UGT, AR, N)) { // Cache knowledge of AR NW, which is propagated to this // AddRec. Negative step causes unsigned wrap, but it // still can't self-wrap. setNoWrapFlags(const_cast(AR), SCEV::FlagNW); // Return the expression with the addrec on the outside. Start = getExtendAddRecStart(AR, Ty, this, Depth + 1); Step = getSignExtendExpr(Step, Ty, Depth + 1); return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } } } // zext({C,+,Step}) --> (zext(D) + zext({C-D,+,Step})) // if D + (C - D + Step * n) could be proven to not unsigned wrap // where D maximizes the number of trailing zeros of (C - D + Step * n) if (const auto *SC = dyn_cast(Start)) { const APInt &C = SC->getAPInt(); const APInt &D = extractConstantWithoutWrapping(*this, C, Step); if (D != 0) { const SCEV *SZExtD = getZeroExtendExpr(getConstant(D), Ty, Depth); const SCEV *SResidual = getAddRecExpr(getConstant(C - D), Step, L, AR->getNoWrapFlags()); const SCEV *SZExtR = getZeroExtendExpr(SResidual, Ty, Depth + 1); return getAddExpr(SZExtD, SZExtR, (SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNUW), Depth + 1); } } if (proveNoWrapByVaryingStart(Start, Step, L)) { setNoWrapFlags(const_cast(AR), SCEV::FlagNUW); Start = getExtendAddRecStart(AR, Ty, this, Depth + 1); Step = getZeroExtendExpr(Step, Ty, Depth + 1); return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } } // zext(A % B) --> zext(A) % zext(B) { const SCEV *LHS; const SCEV *RHS; if (matchURem(Op, LHS, RHS)) return getURemExpr(getZeroExtendExpr(LHS, Ty, Depth + 1), getZeroExtendExpr(RHS, Ty, Depth + 1)); } // zext(A / B) --> zext(A) / zext(B). if (auto *Div = dyn_cast(Op)) return getUDivExpr(getZeroExtendExpr(Div->getLHS(), Ty, Depth + 1), getZeroExtendExpr(Div->getRHS(), Ty, Depth + 1)); if (auto *SA = dyn_cast(Op)) { // zext((A + B + ...)) --> (zext(A) + zext(B) + ...) if (SA->hasNoUnsignedWrap()) { // If the addition does not unsign overflow then we can, by definition, // commute the zero extension with the addition operation. SmallVector Ops; for (const auto *Op : SA->operands()) Ops.push_back(getZeroExtendExpr(Op, Ty, Depth + 1)); return getAddExpr(Ops, SCEV::FlagNUW, Depth + 1); } // zext(C + x + y + ...) --> (zext(D) + zext((C - D) + x + y + ...)) // if D + (C - D + x + y + ...) could be proven to not unsigned wrap // where D maximizes the number of trailing zeros of (C - D + x + y + ...) // // Often address arithmetics contain expressions like // (zext (add (shl X, C1), C2)), for instance, (zext (5 + (4 * X))). // This transformation is useful while proving that such expressions are // equal or differ by a small constant amount, see LoadStoreVectorizer pass. 
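// Worked instance (hypothetical i8 values, for illustration only): for
// zext(5 + 4*%X) the non-constant part has at least two trailing zero bits,
// so extractConstantWithoutWrapping picks D = 1 (the low two bits of 5) and
// the expression is rewritten as zext(1) + zext(4 + 4*%X); because the
// residual is a multiple of 4 and D < 4, adding D back cannot carry into the
// higher bits, so the outer add can carry nuw and nsw.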
if (const auto *SC = dyn_cast(SA->getOperand(0))) { const APInt &D = extractConstantWithoutWrapping(*this, SC, SA); if (D != 0) { const SCEV *SZExtD = getZeroExtendExpr(getConstant(D), Ty, Depth); const SCEV *SResidual = getAddExpr(getConstant(-D), SA, SCEV::FlagAnyWrap, Depth); const SCEV *SZExtR = getZeroExtendExpr(SResidual, Ty, Depth + 1); return getAddExpr(SZExtD, SZExtR, (SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNUW), Depth + 1); } } } if (auto *SM = dyn_cast(Op)) { // zext((A * B * ...)) --> (zext(A) * zext(B) * ...) if (SM->hasNoUnsignedWrap()) { // If the multiply does not unsign overflow then we can, by definition, // commute the zero extension with the multiply operation. SmallVector Ops; for (const auto *Op : SM->operands()) Ops.push_back(getZeroExtendExpr(Op, Ty, Depth + 1)); return getMulExpr(Ops, SCEV::FlagNUW, Depth + 1); } // zext(2^K * (trunc X to iN)) to iM -> // 2^K * (zext(trunc X to i{N-K}) to iM) // // Proof: // // zext(2^K * (trunc X to iN)) to iM // = zext((trunc X to iN) << K) to iM // = zext((trunc X to i{N-K}) << K) to iM // (because shl removes the top K bits) // = zext((2^K * (trunc X to i{N-K}))) to iM // = (2^K * (zext(trunc X to i{N-K}) to iM)). // if (SM->getNumOperands() == 2) if (auto *MulLHS = dyn_cast(SM->getOperand(0))) if (MulLHS->getAPInt().isPowerOf2()) if (auto *TruncRHS = dyn_cast(SM->getOperand(1))) { int NewTruncBits = getTypeSizeInBits(TruncRHS->getType()) - MulLHS->getAPInt().logBase2(); Type *NewTruncTy = IntegerType::get(getContext(), NewTruncBits); return getMulExpr( getZeroExtendExpr(MulLHS, Ty), getZeroExtendExpr( getTruncateExpr(TruncRHS->getOperand(), NewTruncTy), Ty), SCEV::FlagNUW, Depth + 1); } } // zext(umin(x, y)) -> umin(zext(x), zext(y)) // zext(umax(x, y)) -> umax(zext(x), zext(y)) if (isa(Op) || isa(Op)) { auto *MinMax = cast(Op); SmallVector Operands; for (auto *Operand : MinMax->operands()) Operands.push_back(getZeroExtendExpr(Operand, Ty)); if (isa(MinMax)) return getUMinExpr(Operands); return getUMaxExpr(Operands); } // zext(umin_seq(x, y)) -> umin_seq(zext(x), zext(y)) if (auto *MinMax = dyn_cast(Op)) { assert(isa(MinMax) && "Not supported!"); SmallVector Operands; for (auto *Operand : MinMax->operands()) Operands.push_back(getZeroExtendExpr(Operand, Ty)); return getUMinExpr(Operands, /*Sequential*/ true); } // The cast wasn't folded; create an explicit cast node. // Recompute the insert position, as it may have been invalidated. 
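// The recursive SCEV construction above (getZeroExtendExpr, getAddExpr, ...)
// may itself have inserted new nodes into UniqueSCEVs, which can invalidate
// the IP captured by the earlier FindNodeOrInsertPos call; hence the lookup
// and insert position are redone here before creating the node.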
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; SCEV *S = new (SCEVAllocator) SCEVZeroExtendExpr(ID.Intern(SCEVAllocator), Op, Ty); UniqueSCEVs.InsertNode(S, IP); registerUser(S, Op); return S; } const SCEV * ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) && "This is not an extending conversion!"); assert(isSCEVable(Ty) && "This is not a conversion to a SCEVable type!"); assert(!Op->getType()->isPointerTy() && "Can't extend pointer!"); Ty = getEffectiveSCEVType(Ty); FoldID ID(scSignExtend, Op, Ty); auto Iter = FoldCache.find(ID); if (Iter != FoldCache.end()) return Iter->second; const SCEV *S = getSignExtendExprImpl(Op, Ty, Depth); if (!isa(S)) insertFoldCacheEntry(ID, S, FoldCache, FoldCacheUser); return S; } const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, unsigned Depth) { assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) && "This is not an extending conversion!"); assert(isSCEVable(Ty) && "This is not a conversion to a SCEVable type!"); assert(!Op->getType()->isPointerTy() && "Can't extend pointer!"); Ty = getEffectiveSCEVType(Ty); // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast(Op)) return getConstant( cast(ConstantExpr::getSExt(SC->getValue(), Ty))); // sext(sext(x)) --> sext(x) if (const SCEVSignExtendExpr *SS = dyn_cast(Op)) return getSignExtendExpr(SS->getOperand(), Ty, Depth + 1); // sext(zext(x)) --> zext(x) if (const SCEVZeroExtendExpr *SZ = dyn_cast(Op)) return getZeroExtendExpr(SZ->getOperand(), Ty, Depth + 1); // Before doing any expensive analysis, check to see if we've already // computed a SCEV for this Op and Ty. FoldingSetNodeID ID; ID.AddInteger(scSignExtend); ID.AddPointer(Op); ID.AddPointer(Ty); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; // Limit recursion depth. if (Depth > MaxCastDepth) { SCEV *S = new (SCEVAllocator) SCEVSignExtendExpr(ID.Intern(SCEVAllocator), Op, Ty); UniqueSCEVs.InsertNode(S, IP); registerUser(S, Op); return S; } // sext(trunc(x)) --> sext(x) or x or trunc(x) if (const SCEVTruncateExpr *ST = dyn_cast(Op)) { // It's possible the bits taken off by the truncate were all sign bits. If // so, we should be able to simplify this further. const SCEV *X = ST->getOperand(); ConstantRange CR = getSignedRange(X); unsigned TruncBits = getTypeSizeInBits(ST->getType()); unsigned NewBits = getTypeSizeInBits(Ty); if (CR.truncate(TruncBits).signExtend(NewBits).contains( CR.sextOrTrunc(NewBits))) return getTruncateOrSignExtend(X, Ty, Depth); } if (auto *SA = dyn_cast(Op)) { // sext((A + B + ...)) --> (sext(A) + sext(B) + ...) if (SA->hasNoSignedWrap()) { // If the addition does not sign overflow then we can, by definition, // commute the sign extension with the addition operation. SmallVector Ops; for (const auto *Op : SA->operands()) Ops.push_back(getSignExtendExpr(Op, Ty, Depth + 1)); return getAddExpr(Ops, SCEV::FlagNSW, Depth + 1); } // sext(C + x + y + ...) --> (sext(D) + sext((C - D) + x + y + ...)) // if D + (C - D + x + y + ...) could be proven to not signed wrap // where D maximizes the number of trailing zeros of (C - D + x + y + ...) 
// // For instance, this will bring two seemingly different expressions: // 1 + sext(5 + 20 * %x + 24 * %y) and // sext(6 + 20 * %x + 24 * %y) // to the same form: // 2 + sext(4 + 20 * %x + 24 * %y) if (const auto *SC = dyn_cast(SA->getOperand(0))) { const APInt &D = extractConstantWithoutWrapping(*this, SC, SA); if (D != 0) { const SCEV *SSExtD = getSignExtendExpr(getConstant(D), Ty, Depth); const SCEV *SResidual = getAddExpr(getConstant(-D), SA, SCEV::FlagAnyWrap, Depth); const SCEV *SSExtR = getSignExtendExpr(SResidual, Ty, Depth + 1); return getAddExpr(SSExtD, SSExtR, (SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNUW), Depth + 1); } } } // If the input value is a chrec scev, and we can prove that the value // did not overflow the old, smaller, value, we can sign extend all of the // operands (often constants). This allows analysis of something like // this: for (signed char X = 0; X < 100; ++X) { int Y = X; } if (const SCEVAddRecExpr *AR = dyn_cast(Op)) if (AR->isAffine()) { const SCEV *Start = AR->getStart(); const SCEV *Step = AR->getStepRecurrence(*this); unsigned BitWidth = getTypeSizeInBits(AR->getType()); const Loop *L = AR->getLoop(); // If we have special knowledge that this addrec won't overflow, // we don't need to do any further analysis. if (AR->hasNoSignedWrap()) { Start = getExtendAddRecStart(AR, Ty, this, Depth + 1); Step = getSignExtendExpr(Step, Ty, Depth + 1); return getAddRecExpr(Start, Step, L, SCEV::FlagNSW); } // Check whether the backedge-taken count is SCEVCouldNotCompute. // Note that this serves two purposes: It filters out loops that are // simply not analyzable, and it covers the case where this code is // being called from within backedge-taken count analysis, such that // attempting to ask for the backedge-taken count would likely result // in infinite recursion. In the later case, the analysis code will // cope with a conservative value, and it will take care to purge // that value once it has finished. const SCEV *MaxBECount = getConstantMaxBackedgeTakenCount(L); if (!isa(MaxBECount)) { // Manually compute the final value for AR, checking for // overflow. // Check whether the backedge-taken count can be losslessly casted to // the addrec's type. The count is always unsigned. const SCEV *CastedMaxBECount = getTruncateOrZeroExtend(MaxBECount, Start->getType(), Depth); const SCEV *RecastedMaxBECount = getTruncateOrZeroExtend( CastedMaxBECount, MaxBECount->getType(), Depth); if (MaxBECount == RecastedMaxBECount) { Type *WideTy = IntegerType::get(getContext(), BitWidth * 2); // Check whether Start+Step*MaxBECount has no signed overflow. const SCEV *SMul = getMulExpr(CastedMaxBECount, Step, SCEV::FlagAnyWrap, Depth + 1); const SCEV *SAdd = getSignExtendExpr(getAddExpr(Start, SMul, SCEV::FlagAnyWrap, Depth + 1), WideTy, Depth + 1); const SCEV *WideStart = getSignExtendExpr(Start, WideTy, Depth + 1); const SCEV *WideMaxBECount = getZeroExtendExpr(CastedMaxBECount, WideTy, Depth + 1); const SCEV *OperandExtendedAdd = getAddExpr(WideStart, getMulExpr(WideMaxBECount, getSignExtendExpr(Step, WideTy, Depth + 1), SCEV::FlagAnyWrap, Depth + 1), SCEV::FlagAnyWrap, Depth + 1); if (SAdd == OperandExtendedAdd) { // Cache knowledge of AR NSW, which is propagated to this AddRec. setNoWrapFlags(const_cast(AR), SCEV::FlagNSW); // Return the expression with the addrec on the outside. 
Start = getExtendAddRecStart(AR, Ty, this, Depth + 1); Step = getSignExtendExpr(Step, Ty, Depth + 1); return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } // Similar to above, only this time treat the step value as unsigned. // This covers loops that count up with an unsigned step. OperandExtendedAdd = getAddExpr(WideStart, getMulExpr(WideMaxBECount, getZeroExtendExpr(Step, WideTy, Depth + 1), SCEV::FlagAnyWrap, Depth + 1), SCEV::FlagAnyWrap, Depth + 1); if (SAdd == OperandExtendedAdd) { // If AR wraps around then // // abs(Step) * MaxBECount > unsigned-max(AR->getType()) // => SAdd != OperandExtendedAdd // // Thus (AR is not NW => SAdd != OperandExtendedAdd) <=> // (SAdd == OperandExtendedAdd => AR is NW) setNoWrapFlags(const_cast(AR), SCEV::FlagNW); // Return the expression with the addrec on the outside. Start = getExtendAddRecStart(AR, Ty, this, Depth + 1); Step = getZeroExtendExpr(Step, Ty, Depth + 1); return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } } } auto NewFlags = proveNoSignedWrapViaInduction(AR); setNoWrapFlags(const_cast(AR), NewFlags); if (AR->hasNoSignedWrap()) { // Same as nsw case above - duplicated here to avoid a compile time // issue. It's not clear that the order of checks does matter, but // it's one of two issue possible causes for a change which was // reverted. Be conservative for the moment. Start = getExtendAddRecStart(AR, Ty, this, Depth + 1); Step = getSignExtendExpr(Step, Ty, Depth + 1); return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } // sext({C,+,Step}) --> (sext(D) + sext({C-D,+,Step})) // if D + (C - D + Step * n) could be proven to not signed wrap // where D maximizes the number of trailing zeros of (C - D + Step * n) if (const auto *SC = dyn_cast(Start)) { const APInt &C = SC->getAPInt(); const APInt &D = extractConstantWithoutWrapping(*this, C, Step); if (D != 0) { const SCEV *SSExtD = getSignExtendExpr(getConstant(D), Ty, Depth); const SCEV *SResidual = getAddRecExpr(getConstant(C - D), Step, L, AR->getNoWrapFlags()); const SCEV *SSExtR = getSignExtendExpr(SResidual, Ty, Depth + 1); return getAddExpr(SSExtD, SSExtR, (SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNUW), Depth + 1); } } if (proveNoWrapByVaryingStart(Start, Step, L)) { setNoWrapFlags(const_cast(AR), SCEV::FlagNSW); Start = getExtendAddRecStart(AR, Ty, this, Depth + 1); Step = getSignExtendExpr(Step, Ty, Depth + 1); return getAddRecExpr(Start, Step, L, AR->getNoWrapFlags()); } } // If the input value is provably positive and we could not simplify // away the sext build a zext instead. if (isKnownNonNegative(Op)) return getZeroExtendExpr(Op, Ty, Depth + 1); // sext(smin(x, y)) -> smin(sext(x), sext(y)) // sext(smax(x, y)) -> smax(sext(x), sext(y)) if (isa(Op) || isa(Op)) { auto *MinMax = cast(Op); SmallVector Operands; for (auto *Operand : MinMax->operands()) Operands.push_back(getSignExtendExpr(Operand, Ty)); if (isa(MinMax)) return getSMinExpr(Operands); return getSMaxExpr(Operands); } // The cast wasn't folded; create an explicit cast node. // Recompute the insert position, as it may have been invalidated. 
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; SCEV *S = new (SCEVAllocator) SCEVSignExtendExpr(ID.Intern(SCEVAllocator), Op, Ty); UniqueSCEVs.InsertNode(S, IP); registerUser(S, { Op }); return S; } const SCEV *ScalarEvolution::getCastExpr(SCEVTypes Kind, const SCEV *Op, Type *Ty) { switch (Kind) { case scTruncate: return getTruncateExpr(Op, Ty); case scZeroExtend: return getZeroExtendExpr(Op, Ty); case scSignExtend: return getSignExtendExpr(Op, Ty); case scPtrToInt: return getPtrToIntExpr(Op, Ty); default: llvm_unreachable("Not a SCEV cast expression!"); } } /// getAnyExtendExpr - Return a SCEV for the given operand extended with /// unspecified bits out to the given type. const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op, Type *Ty) { assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) && "This is not an extending conversion!"); assert(isSCEVable(Ty) && "This is not a conversion to a SCEVable type!"); Ty = getEffectiveSCEVType(Ty); // Sign-extend negative constants. if (const SCEVConstant *SC = dyn_cast(Op)) if (SC->getAPInt().isNegative()) return getSignExtendExpr(Op, Ty); // Peel off a truncate cast. if (const SCEVTruncateExpr *T = dyn_cast(Op)) { const SCEV *NewOp = T->getOperand(); if (getTypeSizeInBits(NewOp->getType()) < getTypeSizeInBits(Ty)) return getAnyExtendExpr(NewOp, Ty); return getTruncateOrNoop(NewOp, Ty); } // Next try a zext cast. If the cast is folded, use it. const SCEV *ZExt = getZeroExtendExpr(Op, Ty); if (!isa(ZExt)) return ZExt; // Next try a sext cast. If the cast is folded, use it. const SCEV *SExt = getSignExtendExpr(Op, Ty); if (!isa(SExt)) return SExt; // Force the cast to be folded into the operands of an addrec. if (const SCEVAddRecExpr *AR = dyn_cast(Op)) { SmallVector Ops; for (const SCEV *Op : AR->operands()) Ops.push_back(getAnyExtendExpr(Op, Ty)); return getAddRecExpr(Ops, AR->getLoop(), SCEV::FlagNW); } // If the expression is obviously signed, use the sext cast value. if (isa(Op)) return SExt; // Absent any other information, use the zext cast value. return ZExt; } /// Process the given Ops list, which is a list of operands to be added under /// the given scale, update the given map. This is a helper function for /// getAddRecExpr. As an example of what it does, given a sequence of operands /// that would form an add expression like this: /// /// m + n + 13 + (A * (o + p + (B * (q + m + 29)))) + r + (-1 * r) /// /// where A and B are constants, update the map with these values: /// /// (m, 1+A*B), (n, 1), (o, A), (p, A), (q, A*B), (r, 0) /// /// and add 13 + A*B*29 to AccumulatedConstant. /// This will allow getAddRecExpr to produce this: /// /// 13+A*B*29 + n + (m * (1+A*B)) + ((o + p) * A) + (q * A*B) /// /// This form often exposes folding opportunities that are hidden in /// the original operand list. /// /// Return true iff it appears that any interesting folding opportunities /// may be exposed. This helps getAddRecExpr short-circuit extra work in /// the common case where no interesting opportunities are present, and /// is also used as a check to avoid infinite recursion. static bool CollectAddOperandsWithScales(DenseMap &M, SmallVectorImpl &NewOps, APInt &AccumulatedConstant, ArrayRef Ops, const APInt &Scale, ScalarEvolution &SE) { bool Interesting = false; // Iterate over the add operands. They are sorted, with constants first. unsigned i = 0; while (const SCEVConstant *C = dyn_cast(Ops[i])) { ++i; // Pull a buried constant out to the outside. 
if (Scale != 1 || AccumulatedConstant != 0 || C->getValue()->isZero()) Interesting = true; AccumulatedConstant += Scale * C->getAPInt(); } // Next comes everything else. We're especially interested in multiplies // here, but they're in the middle, so just visit the rest with one loop. for (; i != Ops.size(); ++i) { const SCEVMulExpr *Mul = dyn_cast(Ops[i]); if (Mul && isa(Mul->getOperand(0))) { APInt NewScale = Scale * cast(Mul->getOperand(0))->getAPInt(); if (Mul->getNumOperands() == 2 && isa(Mul->getOperand(1))) { // A multiplication of a constant with another add; recurse. const SCEVAddExpr *Add = cast(Mul->getOperand(1)); Interesting |= CollectAddOperandsWithScales(M, NewOps, AccumulatedConstant, Add->operands(), NewScale, SE); } else { // A multiplication of a constant with some other value. Update // the map. SmallVector MulOps(drop_begin(Mul->operands())); const SCEV *Key = SE.getMulExpr(MulOps); auto Pair = M.insert({Key, NewScale}); if (Pair.second) { NewOps.push_back(Pair.first->first); } else { Pair.first->second += NewScale; // The map already had an entry for this value, which may indicate // a folding opportunity. Interesting = true; } } } else { // An ordinary operand. Update the map. std::pair::iterator, bool> Pair = M.insert({Ops[i], Scale}); if (Pair.second) { NewOps.push_back(Pair.first->first); } else { Pair.first->second += Scale; // The map already had an entry for this value, which may indicate // a folding opportunity. Interesting = true; } } } return Interesting; } bool ScalarEvolution::willNotOverflow(Instruction::BinaryOps BinOp, bool Signed, const SCEV *LHS, const SCEV *RHS, const Instruction *CtxI) { const SCEV *(ScalarEvolution::*Operation)(const SCEV *, const SCEV *, SCEV::NoWrapFlags, unsigned); switch (BinOp) { default: llvm_unreachable("Unsupported binary op"); case Instruction::Add: Operation = &ScalarEvolution::getAddExpr; break; case Instruction::Sub: Operation = &ScalarEvolution::getMinusSCEV; break; case Instruction::Mul: Operation = &ScalarEvolution::getMulExpr; break; } const SCEV *(ScalarEvolution::*Extension)(const SCEV *, Type *, unsigned) = Signed ? &ScalarEvolution::getSignExtendExpr : &ScalarEvolution::getZeroExtendExpr; // Check ext(LHS op RHS) == ext(LHS) op ext(RHS) auto *NarrowTy = cast(LHS->getType()); auto *WideTy = IntegerType::get(NarrowTy->getContext(), NarrowTy->getBitWidth() * 2); const SCEV *A = (this->*Extension)( (this->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0), WideTy, 0); const SCEV *LHSB = (this->*Extension)(LHS, WideTy, 0); const SCEV *RHSB = (this->*Extension)(RHS, WideTy, 0); const SCEV *B = (this->*Operation)(LHSB, RHSB, SCEV::FlagAnyWrap, 0); if (A == B) return true; // Can we use context to prove the fact we need? if (!CtxI) return false; // TODO: Support mul. if (BinOp == Instruction::Mul) return false; auto *RHSC = dyn_cast(RHS); // TODO: Lift this limitation. if (!RHSC) return false; APInt C = RHSC->getAPInt(); unsigned NumBits = C.getBitWidth(); bool IsSub = (BinOp == Instruction::Sub); bool IsNegativeConst = (Signed && C.isNegative()); // Compute the direction and magnitude by which we need to check overflow. bool OverflowDown = IsSub ^ IsNegativeConst; APInt Magnitude = C; if (IsNegativeConst) { if (C == APInt::getSignedMinValue(NumBits)) // TODO: SINT_MIN on inversion gives the same negative value, we don't // want to deal with that. return false; Magnitude = -C; } ICmpInst::Predicate Pred = Signed ? 
ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; if (OverflowDown) { // To avoid overflow down, we need to make sure that MIN + Magnitude <= LHS. APInt Min = Signed ? APInt::getSignedMinValue(NumBits) : APInt::getMinValue(NumBits); APInt Limit = Min + Magnitude; return isKnownPredicateAt(Pred, getConstant(Limit), LHS, CtxI); } else { // To avoid overflow up, we need to make sure that LHS <= MAX - Magnitude. APInt Max = Signed ? APInt::getSignedMaxValue(NumBits) : APInt::getMaxValue(NumBits); APInt Limit = Max - Magnitude; return isKnownPredicateAt(Pred, LHS, getConstant(Limit), CtxI); } } std::optional ScalarEvolution::getStrengthenedNoWrapFlagsFromBinOp( const OverflowingBinaryOperator *OBO) { // It cannot be done any better. if (OBO->hasNoUnsignedWrap() && OBO->hasNoSignedWrap()) return std::nullopt; SCEV::NoWrapFlags Flags = SCEV::NoWrapFlags::FlagAnyWrap; if (OBO->hasNoUnsignedWrap()) Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW); if (OBO->hasNoSignedWrap()) Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW); bool Deduced = false; if (OBO->getOpcode() != Instruction::Add && OBO->getOpcode() != Instruction::Sub && OBO->getOpcode() != Instruction::Mul) return std::nullopt; const SCEV *LHS = getSCEV(OBO->getOperand(0)); const SCEV *RHS = getSCEV(OBO->getOperand(1)); const Instruction *CtxI = UseContextForNoWrapFlagInference ? dyn_cast(OBO) : nullptr; if (!OBO->hasNoUnsignedWrap() && willNotOverflow((Instruction::BinaryOps)OBO->getOpcode(), /* Signed */ false, LHS, RHS, CtxI)) { Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW); Deduced = true; } if (!OBO->hasNoSignedWrap() && willNotOverflow((Instruction::BinaryOps)OBO->getOpcode(), /* Signed */ true, LHS, RHS, CtxI)) { Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW); Deduced = true; } if (Deduced) return Flags; return std::nullopt; } // We're trying to construct a SCEV of type `Type' with `Ops' as operands and // `OldFlags' as can't-wrap behavior. Infer a more aggressive set of // can't-overflow flags for the operation if possible. static SCEV::NoWrapFlags StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type, const ArrayRef Ops, SCEV::NoWrapFlags Flags) { using namespace std::placeholders; using OBO = OverflowingBinaryOperator; bool CanAnalyze = Type == scAddExpr || Type == scAddRecExpr || Type == scMulExpr; (void)CanAnalyze; assert(CanAnalyze && "don't call from other places!"); int SignOrUnsignMask = SCEV::FlagNUW | SCEV::FlagNSW; SCEV::NoWrapFlags SignOrUnsignWrap = ScalarEvolution::maskFlags(Flags, SignOrUnsignMask); // If FlagNSW is true and all the operands are non-negative, infer FlagNUW. auto IsKnownNonNegative = [&](const SCEV *S) { return SE->isKnownNonNegative(S); }; if (SignOrUnsignWrap == SCEV::FlagNSW && all_of(Ops, IsKnownNonNegative)) Flags = ScalarEvolution::setFlags(Flags, (SCEV::NoWrapFlags)SignOrUnsignMask); SignOrUnsignWrap = ScalarEvolution::maskFlags(Flags, SignOrUnsignMask); if (SignOrUnsignWrap != SignOrUnsignMask && (Type == scAddExpr || Type == scMulExpr) && Ops.size() == 2 && isa(Ops[0])) { auto Opcode = [&] { switch (Type) { case scAddExpr: return Instruction::Add; case scMulExpr: return Instruction::Mul; default: llvm_unreachable("Unexpected SCEV op."); } }(); const APInt &C = cast(Ops[0])->getAPInt(); // (A C) --> (A C) if the op doesn't sign overflow. 
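// Gloss on the two checks below: for the constant operand C,
// makeGuaranteedNoWrapRegion(Opcode, C, NoSignedWrap) yields the set of
// values X for which "X <Opcode> C" cannot sign-wrap; when the signed range
// of the non-constant operand is contained in that region, FlagNSW can be
// added. The block after it is the unsigned analogue for FlagNUW.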
if (!(SignOrUnsignWrap & SCEV::FlagNSW)) { auto NSWRegion = ConstantRange::makeGuaranteedNoWrapRegion( Opcode, C, OBO::NoSignedWrap); if (NSWRegion.contains(SE->getSignedRange(Ops[1]))) Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW); } // (A C) --> (A C) if the op doesn't unsign overflow. if (!(SignOrUnsignWrap & SCEV::FlagNUW)) { auto NUWRegion = ConstantRange::makeGuaranteedNoWrapRegion( Opcode, C, OBO::NoUnsignedWrap); if (NUWRegion.contains(SE->getUnsignedRange(Ops[1]))) Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW); } } // <0,+,nonnegative> is also nuw // TODO: Add corresponding nsw case if (Type == scAddRecExpr && ScalarEvolution::hasFlags(Flags, SCEV::FlagNW) && !ScalarEvolution::hasFlags(Flags, SCEV::FlagNUW) && Ops.size() == 2 && Ops[0]->isZero() && IsKnownNonNegative(Ops[1])) Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW); // both (udiv X, Y) * Y and Y * (udiv X, Y) are always NUW if (Type == scMulExpr && !ScalarEvolution::hasFlags(Flags, SCEV::FlagNUW) && Ops.size() == 2) { if (auto *UDiv = dyn_cast(Ops[0])) if (UDiv->getOperand(1) == Ops[1]) Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW); if (auto *UDiv = dyn_cast(Ops[1])) if (UDiv->getOperand(1) == Ops[0]) Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW); } return Flags; } bool ScalarEvolution::isAvailableAtLoopEntry(const SCEV *S, const Loop *L) { return isLoopInvariant(S, L) && properlyDominates(S, L->getHeader()); } /// Get a canonical add expression, or something simpler if possible. const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, SCEV::NoWrapFlags OrigFlags, unsigned Depth) { assert(!(OrigFlags & ~(SCEV::FlagNUW | SCEV::FlagNSW)) && "only nuw or nsw allowed"); assert(!Ops.empty() && "Cannot get empty add!"); if (Ops.size() == 1) return Ops[0]; #ifndef NDEBUG Type *ETy = getEffectiveSCEVType(Ops[0]->getType()); for (unsigned i = 1, e = Ops.size(); i != e; ++i) assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy && "SCEVAddExpr operand types don't match!"); unsigned NumPtrs = count_if( Ops, [](const SCEV *Op) { return Op->getType()->isPointerTy(); }); assert(NumPtrs <= 1 && "add has at most one pointer operand"); #endif // Sort by complexity, this groups all similar expression types together. GroupByComplexity(Ops, &LI, DT); // If there are any constants, fold them together. unsigned Idx = 0; if (const SCEVConstant *LHSC = dyn_cast(Ops[0])) { ++Idx; assert(Idx < Ops.size()); while (const SCEVConstant *RHSC = dyn_cast(Ops[Idx])) { // We found two constants, fold them together! Ops[0] = getConstant(LHSC->getAPInt() + RHSC->getAPInt()); if (Ops.size() == 2) return Ops[0]; Ops.erase(Ops.begin()+1); // Erase the folded element LHSC = cast(Ops[0]); } // If we are left with a constant zero being added, strip it off. if (LHSC->getValue()->isZero()) { Ops.erase(Ops.begin()); --Idx; } if (Ops.size() == 1) return Ops[0]; } // Delay expensive flag strengthening until necessary. auto ComputeFlags = [this, OrigFlags](const ArrayRef Ops) { return StrengthenNoWrapFlags(this, scAddExpr, Ops, OrigFlags); }; // Limit recursion calls depth. if (Depth > MaxArithDepth || hasHugeExpression(Ops)) return getOrCreateAddExpr(Ops, ComputeFlags(Ops)); if (SCEV *S = findExistingSCEVInCache(scAddExpr, Ops)) { // Don't strengthen flags if we have no new information. SCEVAddExpr *Add = static_cast(S); if (Add->getNoWrapFlags(OrigFlags) != OrigFlags) Add->setNoWrapFlags(ComputeFlags(Ops)); return S; } // Okay, check to see if the same value occurs in the operand list more than // once. 
If so, merge them together into an multiply expression. Since we // sorted the list, these values are required to be adjacent. Type *Ty = Ops[0]->getType(); bool FoundMatch = false; for (unsigned i = 0, e = Ops.size(); i != e-1; ++i) if (Ops[i] == Ops[i+1]) { // X + Y + Y --> X + Y*2 // Scan ahead to count how many equal operands there are. unsigned Count = 2; while (i+Count != e && Ops[i+Count] == Ops[i]) ++Count; // Merge the values into a multiply. const SCEV *Scale = getConstant(Ty, Count); const SCEV *Mul = getMulExpr(Scale, Ops[i], SCEV::FlagAnyWrap, Depth + 1); if (Ops.size() == Count) return Mul; Ops[i] = Mul; Ops.erase(Ops.begin()+i+1, Ops.begin()+i+Count); --i; e -= Count - 1; FoundMatch = true; } if (FoundMatch) return getAddExpr(Ops, OrigFlags, Depth + 1); // Check for truncates. If all the operands are truncated from the same // type, see if factoring out the truncate would permit the result to be // folded. eg., n*trunc(x) + m*trunc(y) --> trunc(trunc(m)*x + trunc(n)*y) // if the contents of the resulting outer trunc fold to something simple. auto FindTruncSrcType = [&]() -> Type * { // We're ultimately looking to fold an addrec of truncs and muls of only // constants and truncs, so if we find any other types of SCEV // as operands of the addrec then we bail and return nullptr here. // Otherwise, we return the type of the operand of a trunc that we find. if (auto *T = dyn_cast(Ops[Idx])) return T->getOperand()->getType(); if (const auto *Mul = dyn_cast(Ops[Idx])) { const auto *LastOp = Mul->getOperand(Mul->getNumOperands() - 1); if (const auto *T = dyn_cast(LastOp)) return T->getOperand()->getType(); } return nullptr; }; if (auto *SrcType = FindTruncSrcType()) { SmallVector LargeOps; bool Ok = true; // Check all the operands to see if they can be represented in the // source type of the truncate. for (unsigned i = 0, e = Ops.size(); i != e; ++i) { if (const SCEVTruncateExpr *T = dyn_cast(Ops[i])) { if (T->getOperand()->getType() != SrcType) { Ok = false; break; } LargeOps.push_back(T->getOperand()); } else if (const SCEVConstant *C = dyn_cast(Ops[i])) { LargeOps.push_back(getAnyExtendExpr(C, SrcType)); } else if (const SCEVMulExpr *M = dyn_cast(Ops[i])) { SmallVector LargeMulOps; for (unsigned j = 0, f = M->getNumOperands(); j != f && Ok; ++j) { if (const SCEVTruncateExpr *T = dyn_cast(M->getOperand(j))) { if (T->getOperand()->getType() != SrcType) { Ok = false; break; } LargeMulOps.push_back(T->getOperand()); } else if (const auto *C = dyn_cast(M->getOperand(j))) { LargeMulOps.push_back(getAnyExtendExpr(C, SrcType)); } else { Ok = false; break; } } if (Ok) LargeOps.push_back(getMulExpr(LargeMulOps, SCEV::FlagAnyWrap, Depth + 1)); } else { Ok = false; break; } } if (Ok) { // Evaluate the expression in the larger type. const SCEV *Fold = getAddExpr(LargeOps, SCEV::FlagAnyWrap, Depth + 1); // If it folds to something simple, use it. Otherwise, don't. if (isa(Fold) || isa(Fold)) return getTruncateExpr(Fold, Ty); } } if (Ops.size() == 2) { // Check if we have an expression of the form ((X + C1) - C2), where C1 and // C2 can be folded in a way that allows retaining wrapping flags of (X + // C1). 
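// Worked instance (hypothetical values): given (X + 10)<nuw> plus the
// constant -3, ConstAdd is 7 and 7 ule 10, so nuw is kept and the result is
// (X + 7)<nuw>; if X + 10 does not wrap unsigned, the smaller X + 7 cannot
// either.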
const SCEV *A = Ops[0]; const SCEV *B = Ops[1]; auto *AddExpr = dyn_cast(B); auto *C = dyn_cast(A); if (AddExpr && C && isa(AddExpr->getOperand(0))) { auto C1 = cast(AddExpr->getOperand(0))->getAPInt(); auto C2 = C->getAPInt(); SCEV::NoWrapFlags PreservedFlags = SCEV::FlagAnyWrap; APInt ConstAdd = C1 + C2; auto AddFlags = AddExpr->getNoWrapFlags(); // Adding a smaller constant is NUW if the original AddExpr was NUW. if (ScalarEvolution::hasFlags(AddFlags, SCEV::FlagNUW) && ConstAdd.ule(C1)) { PreservedFlags = ScalarEvolution::setFlags(PreservedFlags, SCEV::FlagNUW); } // Adding a constant with the same sign and small magnitude is NSW, if the // original AddExpr was NSW. if (ScalarEvolution::hasFlags(AddFlags, SCEV::FlagNSW) && C1.isSignBitSet() == ConstAdd.isSignBitSet() && ConstAdd.abs().ule(C1.abs())) { PreservedFlags = ScalarEvolution::setFlags(PreservedFlags, SCEV::FlagNSW); } if (PreservedFlags != SCEV::FlagAnyWrap) { SmallVector NewOps(AddExpr->operands()); NewOps[0] = getConstant(ConstAdd); return getAddExpr(NewOps, PreservedFlags); } } } // Canonicalize (-1 * urem X, Y) + X --> (Y * X/Y) if (Ops.size() == 2) { const SCEVMulExpr *Mul = dyn_cast(Ops[0]); if (Mul && Mul->getNumOperands() == 2 && Mul->getOperand(0)->isAllOnesValue()) { const SCEV *X; const SCEV *Y; if (matchURem(Mul->getOperand(1), X, Y) && X == Ops[1]) { return getMulExpr(Y, getUDivExpr(X, Y)); } } } // Skip past any other cast SCEVs. while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddExpr) ++Idx; // If there are add operands they would be next. if (Idx < Ops.size()) { bool DeletedAdd = false; // If the original flags and all inlined SCEVAddExprs are NUW, use the // common NUW flag for expression after inlining. Other flags cannot be // preserved, because they may depend on the original order of operations. SCEV::NoWrapFlags CommonFlags = maskFlags(OrigFlags, SCEV::FlagNUW); while (const SCEVAddExpr *Add = dyn_cast(Ops[Idx])) { if (Ops.size() > AddOpsInlineThreshold || Add->getNumOperands() > AddOpsInlineThreshold) break; // If we have an add, expand the add operands onto the end of the operands // list. Ops.erase(Ops.begin()+Idx); append_range(Ops, Add->operands()); DeletedAdd = true; CommonFlags = maskFlags(CommonFlags, Add->getNoWrapFlags()); } // If we deleted at least one add, we added operands to the end of the list, // and they are not necessarily sorted. Recurse to resort and resimplify // any operands we just acquired. if (DeletedAdd) return getAddExpr(Ops, CommonFlags, Depth + 1); } // Skip over the add expression until we get to a multiply. while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scMulExpr) ++Idx; // Check to see if there are any folding opportunities present with // operands multiplied by constant values. if (Idx < Ops.size() && isa(Ops[Idx])) { uint64_t BitWidth = getTypeSizeInBits(Ty); DenseMap M; SmallVector NewOps; APInt AccumulatedConstant(BitWidth, 0); if (CollectAddOperandsWithScales(M, NewOps, AccumulatedConstant, Ops, APInt(BitWidth, 1), *this)) { struct APIntCompare { bool operator()(const APInt &LHS, const APInt &RHS) const { return LHS.ult(RHS); } }; // Some interesting folding opportunity is present, so its worthwhile to // re-generate the operands list. Group the operands by constant scale, // to avoid multiplying by the same constant scale multiple times. std::map, APIntCompare> MulOpLists; for (const SCEV *NewOp : NewOps) MulOpLists[M.find(NewOp)->second].push_back(NewOp); // Re-generate the operands list. 
Ops.clear(); if (AccumulatedConstant != 0) Ops.push_back(getConstant(AccumulatedConstant)); for (auto &MulOp : MulOpLists) { if (MulOp.first == 1) { Ops.push_back(getAddExpr(MulOp.second, SCEV::FlagAnyWrap, Depth + 1)); } else if (MulOp.first != 0) { Ops.push_back(getMulExpr( getConstant(MulOp.first), getAddExpr(MulOp.second, SCEV::FlagAnyWrap, Depth + 1), SCEV::FlagAnyWrap, Depth + 1)); } } if (Ops.empty()) return getZero(Ty); if (Ops.size() == 1) return Ops[0]; return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1); } } // If we are adding something to a multiply expression, make sure the // something is not already an operand of the multiply. If so, merge it into // the multiply. for (; Idx < Ops.size() && isa(Ops[Idx]); ++Idx) { const SCEVMulExpr *Mul = cast(Ops[Idx]); for (unsigned MulOp = 0, e = Mul->getNumOperands(); MulOp != e; ++MulOp) { const SCEV *MulOpSCEV = Mul->getOperand(MulOp); if (isa(MulOpSCEV)) continue; for (unsigned AddOp = 0, e = Ops.size(); AddOp != e; ++AddOp) if (MulOpSCEV == Ops[AddOp]) { // Fold W + X + (X * Y * Z) --> W + (X * ((Y*Z)+1)) const SCEV *InnerMul = Mul->getOperand(MulOp == 0); if (Mul->getNumOperands() != 2) { // If the multiply has more than two operands, we must get the // Y*Z term. SmallVector MulOps( Mul->operands().take_front(MulOp)); append_range(MulOps, Mul->operands().drop_front(MulOp + 1)); InnerMul = getMulExpr(MulOps, SCEV::FlagAnyWrap, Depth + 1); } SmallVector TwoOps = {getOne(Ty), InnerMul}; const SCEV *AddOne = getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1); const SCEV *OuterMul = getMulExpr(AddOne, MulOpSCEV, SCEV::FlagAnyWrap, Depth + 1); if (Ops.size() == 2) return OuterMul; if (AddOp < Idx) { Ops.erase(Ops.begin()+AddOp); Ops.erase(Ops.begin()+Idx-1); } else { Ops.erase(Ops.begin()+Idx); Ops.erase(Ops.begin()+AddOp-1); } Ops.push_back(OuterMul); return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1); } // Check this multiply against other multiplies being added together. for (unsigned OtherMulIdx = Idx+1; OtherMulIdx < Ops.size() && isa(Ops[OtherMulIdx]); ++OtherMulIdx) { const SCEVMulExpr *OtherMul = cast(Ops[OtherMulIdx]); // If MulOp occurs in OtherMul, we can fold the two multiplies // together. for (unsigned OMulOp = 0, e = OtherMul->getNumOperands(); OMulOp != e; ++OMulOp) if (OtherMul->getOperand(OMulOp) == MulOpSCEV) { // Fold X + (A*B*C) + (A*D*E) --> X + (A*(B*C+D*E)) const SCEV *InnerMul1 = Mul->getOperand(MulOp == 0); if (Mul->getNumOperands() != 2) { SmallVector MulOps( Mul->operands().take_front(MulOp)); append_range(MulOps, Mul->operands().drop_front(MulOp+1)); InnerMul1 = getMulExpr(MulOps, SCEV::FlagAnyWrap, Depth + 1); } const SCEV *InnerMul2 = OtherMul->getOperand(OMulOp == 0); if (OtherMul->getNumOperands() != 2) { SmallVector MulOps( OtherMul->operands().take_front(OMulOp)); append_range(MulOps, OtherMul->operands().drop_front(OMulOp+1)); InnerMul2 = getMulExpr(MulOps, SCEV::FlagAnyWrap, Depth + 1); } SmallVector TwoOps = {InnerMul1, InnerMul2}; const SCEV *InnerMulSum = getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1); const SCEV *OuterMul = getMulExpr(MulOpSCEV, InnerMulSum, SCEV::FlagAnyWrap, Depth + 1); if (Ops.size() == 2) return OuterMul; Ops.erase(Ops.begin()+Idx); Ops.erase(Ops.begin()+OtherMulIdx-1); Ops.push_back(OuterMul); return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1); } } } } // If there are any add recurrences in the operands list, see if any other // added values are loop invariant. If so, we can fold them into the // recurrence. 
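// Sketch (hypothetical names): %inv + {0,+,4}<%L>, with %inv invariant in %L,
// folds to {%inv,+,4}<%L> via the "NLI + LI + {Start,+,Step}" rule applied
// below; the surrounding logic only decides which no-wrap flags survive the
// fold.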
while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddRecExpr) ++Idx; // Scan over all recurrences, trying to fold loop invariants into them. for (; Idx < Ops.size() && isa(Ops[Idx]); ++Idx) { // Scan all of the other operands to this add and add them to the vector if // they are loop invariant w.r.t. the recurrence. SmallVector LIOps; const SCEVAddRecExpr *AddRec = cast(Ops[Idx]); const Loop *AddRecLoop = AddRec->getLoop(); for (unsigned i = 0, e = Ops.size(); i != e; ++i) if (isAvailableAtLoopEntry(Ops[i], AddRecLoop)) { LIOps.push_back(Ops[i]); Ops.erase(Ops.begin()+i); --i; --e; } // If we found some loop invariants, fold them into the recurrence. if (!LIOps.empty()) { // Compute nowrap flags for the addition of the loop-invariant ops and // the addrec. Temporarily push it as an operand for that purpose. These // flags are valid in the scope of the addrec only. LIOps.push_back(AddRec); SCEV::NoWrapFlags Flags = ComputeFlags(LIOps); LIOps.pop_back(); // NLI + LI + {Start,+,Step} --> NLI + {LI+Start,+,Step} LIOps.push_back(AddRec->getStart()); SmallVector AddRecOps(AddRec->operands()); // It is not in general safe to propagate flags valid on an add within // the addrec scope to one outside it. We must prove that the inner // scope is guaranteed to execute if the outer one does to be able to // safely propagate. We know the program is undefined if poison is // produced on the inner scoped addrec. We also know that *for this use* // the outer scoped add can't overflow (because of the flags we just // computed for the inner scoped add) without the program being undefined. // Proving that entry to the outer scope neccesitates entry to the inner // scope, thus proves the program undefined if the flags would be violated // in the outer scope. SCEV::NoWrapFlags AddFlags = Flags; if (AddFlags != SCEV::FlagAnyWrap) { auto *DefI = getDefiningScopeBound(LIOps); auto *ReachI = &*AddRecLoop->getHeader()->begin(); if (!isGuaranteedToTransferExecutionTo(DefI, ReachI)) AddFlags = SCEV::FlagAnyWrap; } AddRecOps[0] = getAddExpr(LIOps, AddFlags, Depth + 1); // Build the new addrec. Propagate the NUW and NSW flags if both the // outer add and the inner addrec are guaranteed to have no overflow. // Always propagate NW. Flags = AddRec->getNoWrapFlags(setFlags(Flags, SCEV::FlagNW)); const SCEV *NewRec = getAddRecExpr(AddRecOps, AddRecLoop, Flags); // If all of the other operands were loop invariant, we are done. if (Ops.size() == 1) return NewRec; // Otherwise, add the folded AddRec by the non-invariant parts. for (unsigned i = 0;; ++i) if (Ops[i] == AddRec) { Ops[i] = NewRec; break; } return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1); } // Okay, if there weren't any loop invariants to be folded, check to see if // there are multiple AddRec's with the same loop induction variable being // added together. If so, we can fold them. for (unsigned OtherIdx = Idx+1; OtherIdx < Ops.size() && isa(Ops[OtherIdx]); ++OtherIdx) { // We expect the AddRecExpr's to be sorted in reverse dominance order, // so that the 1st found AddRecExpr is dominated by all others. 
assert(DT.dominates( cast(Ops[OtherIdx])->getLoop()->getHeader(), AddRec->getLoop()->getHeader()) && "AddRecExprs are not sorted in reverse dominance order?"); if (AddRecLoop == cast(Ops[OtherIdx])->getLoop()) { // Other + {A,+,B} + {C,+,D} --> Other + {A+C,+,B+D} SmallVector AddRecOps(AddRec->operands()); for (; OtherIdx != Ops.size() && isa(Ops[OtherIdx]); ++OtherIdx) { const auto *OtherAddRec = cast(Ops[OtherIdx]); if (OtherAddRec->getLoop() == AddRecLoop) { for (unsigned i = 0, e = OtherAddRec->getNumOperands(); i != e; ++i) { if (i >= AddRecOps.size()) { append_range(AddRecOps, OtherAddRec->operands().drop_front(i)); break; } SmallVector TwoOps = { AddRecOps[i], OtherAddRec->getOperand(i)}; AddRecOps[i] = getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1); } Ops.erase(Ops.begin() + OtherIdx); --OtherIdx; } } // Step size has changed, so we cannot guarantee no self-wraparound. Ops[Idx] = getAddRecExpr(AddRecOps, AddRecLoop, SCEV::FlagAnyWrap); return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1); } } // Otherwise couldn't fold anything into this recurrence. Move onto the // next one. } // Okay, it looks like we really DO need an add expr. Check to see if we // already have one, otherwise create a new one. return getOrCreateAddExpr(Ops, ComputeFlags(Ops)); } const SCEV * ScalarEvolution::getOrCreateAddExpr(ArrayRef Ops, SCEV::NoWrapFlags Flags) { FoldingSetNodeID ID; ID.AddInteger(scAddExpr); for (const SCEV *Op : Ops) ID.AddPointer(Op); void *IP = nullptr; SCEVAddExpr *S = static_cast(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); if (!S) { const SCEV **O = SCEVAllocator.Allocate(Ops.size()); std::uninitialized_copy(Ops.begin(), Ops.end(), O); S = new (SCEVAllocator) SCEVAddExpr(ID.Intern(SCEVAllocator), O, Ops.size()); UniqueSCEVs.InsertNode(S, IP); registerUser(S, Ops); } S->setNoWrapFlags(Flags); return S; } const SCEV * ScalarEvolution::getOrCreateAddRecExpr(ArrayRef Ops, const Loop *L, SCEV::NoWrapFlags Flags) { FoldingSetNodeID ID; ID.AddInteger(scAddRecExpr); for (const SCEV *Op : Ops) ID.AddPointer(Op); ID.AddPointer(L); void *IP = nullptr; SCEVAddRecExpr *S = static_cast(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); if (!S) { const SCEV **O = SCEVAllocator.Allocate(Ops.size()); std::uninitialized_copy(Ops.begin(), Ops.end(), O); S = new (SCEVAllocator) SCEVAddRecExpr(ID.Intern(SCEVAllocator), O, Ops.size(), L); UniqueSCEVs.InsertNode(S, IP); LoopUsers[L].push_back(S); registerUser(S, Ops); } setNoWrapFlags(S, Flags); return S; } const SCEV * ScalarEvolution::getOrCreateMulExpr(ArrayRef Ops, SCEV::NoWrapFlags Flags) { FoldingSetNodeID ID; ID.AddInteger(scMulExpr); for (const SCEV *Op : Ops) ID.AddPointer(Op); void *IP = nullptr; SCEVMulExpr *S = static_cast(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); if (!S) { const SCEV **O = SCEVAllocator.Allocate(Ops.size()); std::uninitialized_copy(Ops.begin(), Ops.end(), O); S = new (SCEVAllocator) SCEVMulExpr(ID.Intern(SCEVAllocator), O, Ops.size()); UniqueSCEVs.InsertNode(S, IP); registerUser(S, Ops); } S->setNoWrapFlags(Flags); return S; } static uint64_t umul_ov(uint64_t i, uint64_t j, bool &Overflow) { uint64_t k = i*j; if (j > 1 && k / j != i) Overflow = true; return k; } /// Compute the result of "n choose k", the binomial coefficient. If an /// intermediate computation overflows, Overflow will be set and the return will /// be garbage. Overflow is not cleared on absence of overflow. static uint64_t Choose(uint64_t n, uint64_t k, bool &Overflow) { // We use the multiplicative formula: // n(n-1)(n-2)...(n-(k-1)) / k(k-1)(k-2)...1 . 
// At each iteration, we take the n-th term of the numeral and divide by the // (k-n)th term of the denominator. This division will always produce an // integral result, and helps reduce the chance of overflow in the // intermediate computations. However, we can still overflow even when the // final result would fit. if (n == 0 || n == k) return 1; if (k > n) return 0; if (k > n/2) k = n-k; uint64_t r = 1; for (uint64_t i = 1; i <= k; ++i) { r = umul_ov(r, n-(i-1), Overflow); r /= i; } return r; } /// Determine if any of the operands in this SCEV are a constant or if /// any of the add or multiply expressions in this SCEV contain a constant. static bool containsConstantInAddMulChain(const SCEV *StartExpr) { struct FindConstantInAddMulChain { bool FoundConstant = false; bool follow(const SCEV *S) { FoundConstant |= isa(S); return isa(S) || isa(S); } bool isDone() const { return FoundConstant; } }; FindConstantInAddMulChain F; SCEVTraversal ST(F); ST.visitAll(StartExpr); return F.FoundConstant; } /// Get a canonical multiply expression, or something simpler if possible. const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl &Ops, SCEV::NoWrapFlags OrigFlags, unsigned Depth) { assert(OrigFlags == maskFlags(OrigFlags, SCEV::FlagNUW | SCEV::FlagNSW) && "only nuw or nsw allowed"); assert(!Ops.empty() && "Cannot get empty mul!"); if (Ops.size() == 1) return Ops[0]; #ifndef NDEBUG Type *ETy = Ops[0]->getType(); assert(!ETy->isPointerTy()); for (unsigned i = 1, e = Ops.size(); i != e; ++i) assert(Ops[i]->getType() == ETy && "SCEVMulExpr operand types don't match!"); #endif // Sort by complexity, this groups all similar expression types together. GroupByComplexity(Ops, &LI, DT); // If there are any constants, fold them together. unsigned Idx = 0; if (const SCEVConstant *LHSC = dyn_cast(Ops[0])) { ++Idx; assert(Idx < Ops.size()); while (const SCEVConstant *RHSC = dyn_cast(Ops[Idx])) { // We found two constants, fold them together! Ops[0] = getConstant(LHSC->getAPInt() * RHSC->getAPInt()); if (Ops.size() == 2) return Ops[0]; Ops.erase(Ops.begin()+1); // Erase the folded element LHSC = cast(Ops[0]); } // If we have a multiply of zero, it will always be zero. if (LHSC->getValue()->isZero()) return LHSC; // If we are left with a constant one being multiplied, strip it off. if (LHSC->getValue()->isOne()) { Ops.erase(Ops.begin()); --Idx; } if (Ops.size() == 1) return Ops[0]; } // Delay expensive flag strengthening until necessary. auto ComputeFlags = [this, OrigFlags](const ArrayRef Ops) { return StrengthenNoWrapFlags(this, scMulExpr, Ops, OrigFlags); }; // Limit recursion calls depth. if (Depth > MaxArithDepth || hasHugeExpression(Ops)) return getOrCreateMulExpr(Ops, ComputeFlags(Ops)); if (SCEV *S = findExistingSCEVInCache(scMulExpr, Ops)) { // Don't strengthen flags if we have no new information. SCEVMulExpr *Mul = static_cast(S); if (Mul->getNoWrapFlags(OrigFlags) != OrigFlags) Mul->setNoWrapFlags(ComputeFlags(Ops)); return S; } if (const SCEVConstant *LHSC = dyn_cast(Ops[0])) { if (Ops.size() == 2) { // C1*(C2+V) -> C1*C2 + C1*V if (const SCEVAddExpr *Add = dyn_cast(Ops[1])) // If any of Add's ops are Adds or Muls with a constant, apply this // transformation as well. // // TODO: There are some cases where this transformation is not // profitable; for example, Add = (C0 + X) * Y + Z. Maybe the scope of // this transformation should be narrowed down. 
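      // For example, 2*(3 + %x) becomes 6 + 2*%x, which exposes the constant 6
      // to further folding with any sibling add or mul operands.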
if (Add->getNumOperands() == 2 && containsConstantInAddMulChain(Add)) { const SCEV *LHS = getMulExpr(LHSC, Add->getOperand(0), SCEV::FlagAnyWrap, Depth + 1); const SCEV *RHS = getMulExpr(LHSC, Add->getOperand(1), SCEV::FlagAnyWrap, Depth + 1); return getAddExpr(LHS, RHS, SCEV::FlagAnyWrap, Depth + 1); } if (Ops[0]->isAllOnesValue()) { // If we have a mul by -1 of an add, try distributing the -1 among the // add operands. if (const SCEVAddExpr *Add = dyn_cast(Ops[1])) { SmallVector NewOps; bool AnyFolded = false; for (const SCEV *AddOp : Add->operands()) { const SCEV *Mul = getMulExpr(Ops[0], AddOp, SCEV::FlagAnyWrap, Depth + 1); if (!isa(Mul)) AnyFolded = true; NewOps.push_back(Mul); } if (AnyFolded) return getAddExpr(NewOps, SCEV::FlagAnyWrap, Depth + 1); } else if (const auto *AddRec = dyn_cast(Ops[1])) { // Negation preserves a recurrence's no self-wrap property. SmallVector Operands; for (const SCEV *AddRecOp : AddRec->operands()) Operands.push_back(getMulExpr(Ops[0], AddRecOp, SCEV::FlagAnyWrap, Depth + 1)); // Let M be the minimum representable signed value. AddRec with nsw // multiplied by -1 can have signed overflow if and only if it takes a // value of M: M * (-1) would stay M and (M + 1) * (-1) would be the // maximum signed value. In all other cases signed overflow is // impossible. auto FlagsMask = SCEV::FlagNW; if (hasFlags(AddRec->getNoWrapFlags(), SCEV::FlagNSW)) { auto MinInt = APInt::getSignedMinValue(getTypeSizeInBits(AddRec->getType())); if (getSignedRangeMin(AddRec) != MinInt) FlagsMask = setFlags(FlagsMask, SCEV::FlagNSW); } return getAddRecExpr(Operands, AddRec->getLoop(), AddRec->getNoWrapFlags(FlagsMask)); } } } } // Skip over the add expression until we get to a multiply. while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scMulExpr) ++Idx; // If there are mul operands inline them all into this expression. if (Idx < Ops.size()) { bool DeletedMul = false; while (const SCEVMulExpr *Mul = dyn_cast(Ops[Idx])) { if (Ops.size() > MulOpsInlineThreshold) break; // If we have an mul, expand the mul operands onto the end of the // operands list. Ops.erase(Ops.begin()+Idx); append_range(Ops, Mul->operands()); DeletedMul = true; } // If we deleted at least one mul, we added operands to the end of the // list, and they are not necessarily sorted. Recurse to resort and // resimplify any operands we just acquired. if (DeletedMul) return getMulExpr(Ops, SCEV::FlagAnyWrap, Depth + 1); } // If there are any add recurrences in the operands list, see if any other // added values are loop invariant. If so, we can fold them into the // recurrence. while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddRecExpr) ++Idx; // Scan over all recurrences, trying to fold loop invariants into them. for (; Idx < Ops.size() && isa(Ops[Idx]); ++Idx) { // Scan all of the other operands to this mul and add them to the vector // if they are loop invariant w.r.t. the recurrence. SmallVector LIOps; const SCEVAddRecExpr *AddRec = cast(Ops[Idx]); for (unsigned i = 0, e = Ops.size(); i != e; ++i) if (isAvailableAtLoopEntry(Ops[i], AddRec->getLoop())) { LIOps.push_back(Ops[i]); Ops.erase(Ops.begin()+i); --i; --e; } // If we found some loop invariants, fold them into the recurrence. 
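    // For example, if %x is invariant in L, then %x * {2,+,4}<L> becomes
    // {2 * %x,+,4 * %x}<L>: every operand of the recurrence is scaled by the
    // invariant factor.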
if (!LIOps.empty()) { // NLI * LI * {Start,+,Step} --> NLI * {LI*Start,+,LI*Step} SmallVector NewOps; NewOps.reserve(AddRec->getNumOperands()); const SCEV *Scale = getMulExpr(LIOps, SCEV::FlagAnyWrap, Depth + 1); for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) NewOps.push_back(getMulExpr(Scale, AddRec->getOperand(i), SCEV::FlagAnyWrap, Depth + 1)); // Build the new addrec. Propagate the NUW and NSW flags if both the // outer mul and the inner addrec are guaranteed to have no overflow. // // No self-wrap cannot be guaranteed after changing the step size, but // will be inferred if either NUW or NSW is true. SCEV::NoWrapFlags Flags = ComputeFlags({Scale, AddRec}); const SCEV *NewRec = getAddRecExpr( NewOps, AddRec->getLoop(), AddRec->getNoWrapFlags(Flags)); // If all of the other operands were loop invariant, we are done. if (Ops.size() == 1) return NewRec; // Otherwise, multiply the folded AddRec by the non-invariant parts. for (unsigned i = 0;; ++i) if (Ops[i] == AddRec) { Ops[i] = NewRec; break; } return getMulExpr(Ops, SCEV::FlagAnyWrap, Depth + 1); } // Okay, if there weren't any loop invariants to be folded, check to see // if there are multiple AddRec's with the same loop induction variable // being multiplied together. If so, we can fold them. // {A1,+,A2,+,...,+,An} * {B1,+,B2,+,...,+,Bn} // = {x=1 in [ sum y=x..2x [ sum z=max(y-x, y-n)..min(x,n) [ // choose(x, 2x)*choose(2x-y, x-z)*A_{y-z}*B_z // ]]],+,...up to x=2n}. // Note that the arguments to choose() are always integers with values // known at compile time, never SCEV objects. // // The implementation avoids pointless extra computations when the two // addrec's are of different length (mathematically, it's equivalent to // an infinite stream of zeros on the right). bool OpsModified = false; for (unsigned OtherIdx = Idx+1; OtherIdx != Ops.size() && isa(Ops[OtherIdx]); ++OtherIdx) { const SCEVAddRecExpr *OtherAddRec = dyn_cast(Ops[OtherIdx]); if (!OtherAddRec || OtherAddRec->getLoop() != AddRec->getLoop()) continue; // Limit max number of arguments to avoid creation of unreasonably big // SCEVAddRecs with very complex operands. 
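      // As a small instance of the formula above: two affine recurrences over
      // the same loop multiply into a quadratic one,
      //   {a,+,b}<L> * {c,+,d}<L>  ==>  {a*c,+,a*d + b*c + b*d,+,2*b*d}<L>.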
if (AddRec->getNumOperands() + OtherAddRec->getNumOperands() - 1 > MaxAddRecSize || hasHugeExpression({AddRec, OtherAddRec})) continue; bool Overflow = false; Type *Ty = AddRec->getType(); bool LargerThan64Bits = getTypeSizeInBits(Ty) > 64; SmallVector AddRecOps; for (int x = 0, xe = AddRec->getNumOperands() + OtherAddRec->getNumOperands() - 1; x != xe && !Overflow; ++x) { SmallVector SumOps; for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) { uint64_t Coeff1 = Choose(x, 2*x - y, Overflow); for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1), ze = std::min(x+1, (int)OtherAddRec->getNumOperands()); z < ze && !Overflow; ++z) { uint64_t Coeff2 = Choose(2*x - y, x-z, Overflow); uint64_t Coeff; if (LargerThan64Bits) Coeff = umul_ov(Coeff1, Coeff2, Overflow); else Coeff = Coeff1*Coeff2; const SCEV *CoeffTerm = getConstant(Ty, Coeff); const SCEV *Term1 = AddRec->getOperand(y-z); const SCEV *Term2 = OtherAddRec->getOperand(z); SumOps.push_back(getMulExpr(CoeffTerm, Term1, Term2, SCEV::FlagAnyWrap, Depth + 1)); } } if (SumOps.empty()) SumOps.push_back(getZero(Ty)); AddRecOps.push_back(getAddExpr(SumOps, SCEV::FlagAnyWrap, Depth + 1)); } if (!Overflow) { const SCEV *NewAddRec = getAddRecExpr(AddRecOps, AddRec->getLoop(), SCEV::FlagAnyWrap); if (Ops.size() == 2) return NewAddRec; Ops[Idx] = NewAddRec; Ops.erase(Ops.begin() + OtherIdx); --OtherIdx; OpsModified = true; AddRec = dyn_cast(NewAddRec); if (!AddRec) break; } } if (OpsModified) return getMulExpr(Ops, SCEV::FlagAnyWrap, Depth + 1); // Otherwise couldn't fold anything into this recurrence. Move onto the // next one. } // Okay, it looks like we really DO need an mul expr. Check to see if we // already have one, otherwise create a new one. return getOrCreateMulExpr(Ops, ComputeFlags(Ops)); } /// Represents an unsigned remainder expression based on unsigned division. const SCEV *ScalarEvolution::getURemExpr(const SCEV *LHS, const SCEV *RHS) { assert(getEffectiveSCEVType(LHS->getType()) == getEffectiveSCEVType(RHS->getType()) && "SCEVURemExpr operand types don't match!"); // Short-circuit easy cases if (const SCEVConstant *RHSC = dyn_cast(RHS)) { // If constant is one, the result is trivial if (RHSC->getValue()->isOne()) return getZero(LHS->getType()); // X urem 1 --> 0 // If constant is a power of two, fold into a zext(trunc(LHS)). if (RHSC->getAPInt().isPowerOf2()) { Type *FullTy = LHS->getType(); Type *TruncTy = IntegerType::get(getContext(), RHSC->getAPInt().logBase2()); return getZeroExtendExpr(getTruncateExpr(LHS, TruncTy), FullTy); } } // Fallback to %a == %x urem %y == %x - ((%x udiv %y) * %y) const SCEV *UDiv = getUDivExpr(LHS, RHS); const SCEV *Mult = getMulExpr(UDiv, RHS, SCEV::FlagNUW); return getMinusSCEV(LHS, Mult, SCEV::FlagNUW); } /// Get a canonical unsigned division expression, or something simpler if /// possible. const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, const SCEV *RHS) { assert(!LHS->getType()->isPointerTy() && "SCEVUDivExpr operand can't be pointer!"); assert(LHS->getType() == RHS->getType() && "SCEVUDivExpr operand types don't match!"); FoldingSetNodeID ID; ID.AddInteger(scUDivExpr); ID.AddPointer(LHS); ID.AddPointer(RHS); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; // 0 udiv Y == 0 if (const SCEVConstant *LHSC = dyn_cast(LHS)) if (LHSC->getValue()->isZero()) return LHS; if (const SCEVConstant *RHSC = dyn_cast(RHS)) { if (RHSC->getValue()->isOne()) return LHS; // X udiv 1 --> x // If the denominator is zero, the result of the udiv is undefined. 
Don't // try to analyze it, because the resolution chosen here may differ from // the resolution chosen in other parts of the compiler. if (!RHSC->getValue()->isZero()) { // Determine if the division can be folded into the operands of // its operands. // TODO: Generalize this to non-constants by using known-bits information. Type *Ty = LHS->getType(); unsigned LZ = RHSC->getAPInt().countl_zero(); unsigned MaxShiftAmt = getTypeSizeInBits(Ty) - LZ - 1; // For non-power-of-two values, effectively round the value up to the // nearest power of two. if (!RHSC->getAPInt().isPowerOf2()) ++MaxShiftAmt; IntegerType *ExtTy = IntegerType::get(getContext(), getTypeSizeInBits(Ty) + MaxShiftAmt); if (const SCEVAddRecExpr *AR = dyn_cast(LHS)) if (const SCEVConstant *Step = dyn_cast(AR->getStepRecurrence(*this))) { // {X,+,N}/C --> {X/C,+,N/C} if safe and N/C can be folded. const APInt &StepInt = Step->getAPInt(); const APInt &DivInt = RHSC->getAPInt(); if (!StepInt.urem(DivInt) && getZeroExtendExpr(AR, ExtTy) == getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy), getZeroExtendExpr(Step, ExtTy), AR->getLoop(), SCEV::FlagAnyWrap)) { SmallVector Operands; for (const SCEV *Op : AR->operands()) Operands.push_back(getUDivExpr(Op, RHS)); return getAddRecExpr(Operands, AR->getLoop(), SCEV::FlagNW); } /// Get a canonical UDivExpr for a recurrence. /// {X,+,N}/C => {Y,+,N}/C where Y=X-(X%N). Safe when C%N=0. // We can currently only fold X%N if X is constant. const SCEVConstant *StartC = dyn_cast(AR->getStart()); if (StartC && !DivInt.urem(StepInt) && getZeroExtendExpr(AR, ExtTy) == getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy), getZeroExtendExpr(Step, ExtTy), AR->getLoop(), SCEV::FlagAnyWrap)) { const APInt &StartInt = StartC->getAPInt(); const APInt &StartRem = StartInt.urem(StepInt); if (StartRem != 0) { const SCEV *NewLHS = getAddRecExpr(getConstant(StartInt - StartRem), Step, AR->getLoop(), SCEV::FlagNW); if (LHS != NewLHS) { LHS = NewLHS; // Reset the ID to include the new LHS, and check if it is // already cached. ID.clear(); ID.AddInteger(scUDivExpr); ID.AddPointer(LHS); ID.AddPointer(RHS); IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; } } } } // (A*B)/C --> A*(B/C) if safe and B/C can be folded. if (const SCEVMulExpr *M = dyn_cast(LHS)) { SmallVector Operands; for (const SCEV *Op : M->operands()) Operands.push_back(getZeroExtendExpr(Op, ExtTy)); if (getZeroExtendExpr(M, ExtTy) == getMulExpr(Operands)) // Find an operand that's safely divisible. for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i) { const SCEV *Op = M->getOperand(i); const SCEV *Div = getUDivExpr(Op, RHSC); if (!isa(Div) && getMulExpr(Div, RHSC) == Op) { Operands = SmallVector(M->operands()); Operands[i] = Div; return getMulExpr(Operands); } } } // (A/B)/C --> A/(B*C) if safe and B*C can be folded. if (const SCEVUDivExpr *OtherDiv = dyn_cast(LHS)) { if (auto *DivisorConstant = dyn_cast(OtherDiv->getRHS())) { bool Overflow = false; APInt NewRHS = DivisorConstant->getAPInt().umul_ov(RHSC->getAPInt(), Overflow); if (Overflow) { return getConstant(RHSC->getType(), 0, false); } return getUDivExpr(OtherDiv->getLHS(), getConstant(NewRHS)); } } // (A+B)/C --> (A/C + B/C) if safe and A/C and B/C can be folded. 
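      // For example, when the add is known not to overflow in the wider type,
      // (8*%x + 12) /u 4 can become 2*%x + 3; if any operand fails to divide
      // exactly (say 10 /u 4), the whole fold is abandoned rather than done
      // partially.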
if (const SCEVAddExpr *A = dyn_cast(LHS)) { SmallVector Operands; for (const SCEV *Op : A->operands()) Operands.push_back(getZeroExtendExpr(Op, ExtTy)); if (getZeroExtendExpr(A, ExtTy) == getAddExpr(Operands)) { Operands.clear(); for (unsigned i = 0, e = A->getNumOperands(); i != e; ++i) { const SCEV *Op = getUDivExpr(A->getOperand(i), RHS); if (isa(Op) || getMulExpr(Op, RHS) != A->getOperand(i)) break; Operands.push_back(Op); } if (Operands.size() == A->getNumOperands()) return getAddExpr(Operands); } } // Fold if both operands are constant. if (const SCEVConstant *LHSC = dyn_cast(LHS)) return getConstant(LHSC->getAPInt().udiv(RHSC->getAPInt())); } } // The Insertion Point (IP) might be invalid by now (due to UniqueSCEVs // changes). Make sure we get a new one. IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; SCEV *S = new (SCEVAllocator) SCEVUDivExpr(ID.Intern(SCEVAllocator), LHS, RHS); UniqueSCEVs.InsertNode(S, IP); registerUser(S, {LHS, RHS}); return S; } APInt gcd(const SCEVConstant *C1, const SCEVConstant *C2) { APInt A = C1->getAPInt().abs(); APInt B = C2->getAPInt().abs(); uint32_t ABW = A.getBitWidth(); uint32_t BBW = B.getBitWidth(); if (ABW > BBW) B = B.zext(ABW); else if (ABW < BBW) A = A.zext(BBW); return APIntOps::GreatestCommonDivisor(std::move(A), std::move(B)); } /// Get a canonical unsigned division expression, or something simpler if /// possible. There is no representation for an exact udiv in SCEV IR, but we /// can attempt to remove factors from the LHS and RHS. We can't do this when /// it's not exact because the udiv may be clearing bits. const SCEV *ScalarEvolution::getUDivExactExpr(const SCEV *LHS, const SCEV *RHS) { // TODO: we could try to find factors in all sorts of things, but for now we // just deal with u/exact (multiply, constant). See SCEVDivision towards the // end of this file for inspiration. const SCEVMulExpr *Mul = dyn_cast(LHS); if (!Mul || !Mul->hasNoUnsignedWrap()) return getUDivExpr(LHS, RHS); if (const SCEVConstant *RHSCst = dyn_cast(RHS)) { // If the mulexpr multiplies by a constant, then that constant must be the // first element of the mulexpr. if (const auto *LHSCst = dyn_cast(Mul->getOperand(0))) { if (LHSCst == RHSCst) { SmallVector Operands(drop_begin(Mul->operands())); return getMulExpr(Operands); } // We can't just assume that LHSCst divides RHSCst cleanly, it could be // that there's a factor provided by one of the other terms. We need to // check. APInt Factor = gcd(LHSCst, RHSCst); if (!Factor.isIntN(1)) { LHSCst = cast(getConstant(LHSCst->getAPInt().udiv(Factor))); RHSCst = cast(getConstant(RHSCst->getAPInt().udiv(Factor))); SmallVector Operands; Operands.push_back(LHSCst); append_range(Operands, Mul->operands().drop_front()); LHS = getMulExpr(Operands); RHS = RHSCst; Mul = dyn_cast(LHS); if (!Mul) return getUDivExactExpr(LHS, RHS); } } } for (int i = 0, e = Mul->getNumOperands(); i != e; ++i) { if (Mul->getOperand(i) == RHS) { SmallVector Operands; append_range(Operands, Mul->operands().take_front(i)); append_range(Operands, Mul->operands().drop_front(i + 1)); return getMulExpr(Operands); } } return getUDivExpr(LHS, RHS); } /// Get an add recurrence expression for the specified loop. Simplify the /// expression as much as possible. 
const SCEV *ScalarEvolution::getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags) { SmallVector Operands; Operands.push_back(Start); if (const SCEVAddRecExpr *StepChrec = dyn_cast(Step)) if (StepChrec->getLoop() == L) { append_range(Operands, StepChrec->operands()); return getAddRecExpr(Operands, L, maskFlags(Flags, SCEV::FlagNW)); } Operands.push_back(Step); return getAddRecExpr(Operands, L, Flags); } /// Get an add recurrence expression for the specified loop. Simplify the /// expression as much as possible. const SCEV * ScalarEvolution::getAddRecExpr(SmallVectorImpl &Operands, const Loop *L, SCEV::NoWrapFlags Flags) { if (Operands.size() == 1) return Operands[0]; #ifndef NDEBUG Type *ETy = getEffectiveSCEVType(Operands[0]->getType()); for (unsigned i = 1, e = Operands.size(); i != e; ++i) { assert(getEffectiveSCEVType(Operands[i]->getType()) == ETy && "SCEVAddRecExpr operand types don't match!"); assert(!Operands[i]->getType()->isPointerTy() && "Step must be integer"); } for (unsigned i = 0, e = Operands.size(); i != e; ++i) assert(isLoopInvariant(Operands[i], L) && "SCEVAddRecExpr operand is not loop-invariant!"); #endif if (Operands.back()->isZero()) { Operands.pop_back(); return getAddRecExpr(Operands, L, SCEV::FlagAnyWrap); // {X,+,0} --> X } // It's tempting to want to call getConstantMaxBackedgeTakenCount count here and // use that information to infer NUW and NSW flags. However, computing a // BE count requires calling getAddRecExpr, so we may not yet have a // meaningful BE count at this point (and if we don't, we'd be stuck // with a SCEVCouldNotCompute as the cached BE count). Flags = StrengthenNoWrapFlags(this, scAddRecExpr, Operands, Flags); // Canonicalize nested AddRecs in by nesting them in order of loop depth. if (const SCEVAddRecExpr *NestedAR = dyn_cast(Operands[0])) { const Loop *NestedLoop = NestedAR->getLoop(); if (L->contains(NestedLoop) ? (L->getLoopDepth() < NestedLoop->getLoopDepth()) : (!NestedLoop->contains(L) && DT.dominates(L->getHeader(), NestedLoop->getHeader()))) { SmallVector NestedOperands(NestedAR->operands()); Operands[0] = NestedAR->getStart(); // AddRecs require their operands be loop-invariant with respect to their // loops. Don't perform this transformation if it would break this // requirement. bool AllInvariant = all_of( Operands, [&](const SCEV *Op) { return isLoopInvariant(Op, L); }); if (AllInvariant) { // Create a recurrence for the outer loop with the same step size. // // The outer recurrence keeps its NW flag but only keeps NUW/NSW if the // inner recurrence has the same property. SCEV::NoWrapFlags OuterFlags = maskFlags(Flags, SCEV::FlagNW | NestedAR->getNoWrapFlags()); NestedOperands[0] = getAddRecExpr(Operands, L, OuterFlags); AllInvariant = all_of(NestedOperands, [&](const SCEV *Op) { return isLoopInvariant(Op, NestedLoop); }); if (AllInvariant) { // Ok, both add recurrences are valid after the transformation. // // The inner recurrence keeps its NW flag but only keeps NUW/NSW if // the outer recurrence has the same property. SCEV::NoWrapFlags InnerFlags = maskFlags(NestedAR->getNoWrapFlags(), SCEV::FlagNW | Flags); return getAddRecExpr(NestedOperands, NestedLoop, InnerFlags); } } // Reset Operands to its original state. Operands[0] = NestedAR; } } // Okay, it looks like we really DO need an addrec expr. Check to see if we // already have one, otherwise create a new one. 
return getOrCreateAddRecExpr(Operands, L, Flags); } const SCEV * ScalarEvolution::getGEPExpr(GEPOperator *GEP, const SmallVectorImpl &IndexExprs) { const SCEV *BaseExpr = getSCEV(GEP->getPointerOperand()); // getSCEV(Base)->getType() has the same address space as Base->getType() // because SCEV::getType() preserves the address space. Type *IntIdxTy = getEffectiveSCEVType(BaseExpr->getType()); const bool AssumeInBoundsFlags = [&]() { if (!GEP->isInBounds()) return false; // We'd like to propagate flags from the IR to the corresponding SCEV nodes, // but to do that, we have to ensure that said flag is valid in the entire // defined scope of the SCEV. auto *GEPI = dyn_cast(GEP); // TODO: non-instructions have global scope. We might be able to prove // some global scope cases return GEPI && isSCEVExprNeverPoison(GEPI); }(); SCEV::NoWrapFlags OffsetWrap = AssumeInBoundsFlags ? SCEV::FlagNSW : SCEV::FlagAnyWrap; Type *CurTy = GEP->getType(); bool FirstIter = true; SmallVector Offsets; for (const SCEV *IndexExpr : IndexExprs) { // Compute the (potentially symbolic) offset in bytes for this index. if (StructType *STy = dyn_cast(CurTy)) { // For a struct, add the member offset. ConstantInt *Index = cast(IndexExpr)->getValue(); unsigned FieldNo = Index->getZExtValue(); const SCEV *FieldOffset = getOffsetOfExpr(IntIdxTy, STy, FieldNo); Offsets.push_back(FieldOffset); // Update CurTy to the type of the field at Index. CurTy = STy->getTypeAtIndex(Index); } else { // Update CurTy to its element type. if (FirstIter) { assert(isa(CurTy) && "The first index of a GEP indexes a pointer"); CurTy = GEP->getSourceElementType(); FirstIter = false; } else { CurTy = GetElementPtrInst::getTypeAtIndex(CurTy, (uint64_t)0); } // For an array, add the element offset, explicitly scaled. const SCEV *ElementSize = getSizeOfExpr(IntIdxTy, CurTy); // Getelementptr indices are signed. IndexExpr = getTruncateOrSignExtend(IndexExpr, IntIdxTy); // Multiply the index by the element size to compute the element offset. const SCEV *LocalOffset = getMulExpr(IndexExpr, ElementSize, OffsetWrap); Offsets.push_back(LocalOffset); } } // Handle degenerate case of GEP without offsets. if (Offsets.empty()) return BaseExpr; // Add the offsets together, assuming nsw if inbounds. const SCEV *Offset = getAddExpr(Offsets, OffsetWrap); // Add the base address and the offset. We cannot use the nsw flag, as the // base address is unsigned. However, if we know that the offset is // non-negative, we can use nuw. SCEV::NoWrapFlags BaseWrap = AssumeInBoundsFlags && isKnownNonNegative(Offset) ? SCEV::FlagNUW : SCEV::FlagAnyWrap; auto *GEPExpr = getAddExpr(BaseExpr, Offset, BaseWrap); assert(BaseExpr->getType() == GEPExpr->getType() && "GEP should not change type mid-flight."); return GEPExpr; } SCEV *ScalarEvolution::findExistingSCEVInCache(SCEVTypes SCEVType, ArrayRef Ops) { FoldingSetNodeID ID; ID.AddInteger(SCEVType); for (const SCEV *Op : Ops) ID.AddPointer(Op); void *IP = nullptr; return UniqueSCEVs.FindNodeOrInsertPos(ID, IP); } const SCEV *ScalarEvolution::getAbsExpr(const SCEV *Op, bool IsNSW) { SCEV::NoWrapFlags Flags = IsNSW ? 
SCEV::FlagNSW : SCEV::FlagAnyWrap; return getSMaxExpr(Op, getNegativeSCEV(Op, Flags)); } const SCEV *ScalarEvolution::getMinMaxExpr(SCEVTypes Kind, SmallVectorImpl &Ops) { assert(SCEVMinMaxExpr::isMinMaxType(Kind) && "Not a SCEVMinMaxExpr!"); assert(!Ops.empty() && "Cannot get empty (u|s)(min|max)!"); if (Ops.size() == 1) return Ops[0]; #ifndef NDEBUG Type *ETy = getEffectiveSCEVType(Ops[0]->getType()); for (unsigned i = 1, e = Ops.size(); i != e; ++i) { assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy && "Operand types don't match!"); assert(Ops[0]->getType()->isPointerTy() == Ops[i]->getType()->isPointerTy() && "min/max should be consistently pointerish"); } #endif bool IsSigned = Kind == scSMaxExpr || Kind == scSMinExpr; bool IsMax = Kind == scSMaxExpr || Kind == scUMaxExpr; // Sort by complexity, this groups all similar expression types together. GroupByComplexity(Ops, &LI, DT); // Check if we have created the same expression before. if (const SCEV *S = findExistingSCEVInCache(Kind, Ops)) { return S; } // If there are any constants, fold them together. unsigned Idx = 0; if (const SCEVConstant *LHSC = dyn_cast(Ops[0])) { ++Idx; assert(Idx < Ops.size()); auto FoldOp = [&](const APInt &LHS, const APInt &RHS) { switch (Kind) { case scSMaxExpr: return APIntOps::smax(LHS, RHS); case scSMinExpr: return APIntOps::smin(LHS, RHS); case scUMaxExpr: return APIntOps::umax(LHS, RHS); case scUMinExpr: return APIntOps::umin(LHS, RHS); default: llvm_unreachable("Unknown SCEV min/max opcode"); } }; while (const SCEVConstant *RHSC = dyn_cast(Ops[Idx])) { // We found two constants, fold them together! ConstantInt *Fold = ConstantInt::get( getContext(), FoldOp(LHSC->getAPInt(), RHSC->getAPInt())); Ops[0] = getConstant(Fold); Ops.erase(Ops.begin()+1); // Erase the folded element if (Ops.size() == 1) return Ops[0]; LHSC = cast(Ops[0]); } bool IsMinV = LHSC->getValue()->isMinValue(IsSigned); bool IsMaxV = LHSC->getValue()->isMaxValue(IsSigned); if (IsMax ? IsMinV : IsMaxV) { // If we are left with a constant minimum(/maximum)-int, strip it off. Ops.erase(Ops.begin()); --Idx; } else if (IsMax ? IsMaxV : IsMinV) { // If we have a max(/min) with a constant maximum(/minimum)-int, // it will always be the extremum. return LHSC; } if (Ops.size() == 1) return Ops[0]; } // Find the first operation of the same kind while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < Kind) ++Idx; // Check to see if one of the operands is of the same kind. If so, expand its // operands onto our operand list, and recurse to simplify. if (Idx < Ops.size()) { bool DeletedAny = false; while (Ops[Idx]->getSCEVType() == Kind) { const SCEVMinMaxExpr *SMME = cast(Ops[Idx]); Ops.erase(Ops.begin()+Idx); append_range(Ops, SMME->operands()); DeletedAny = true; } if (DeletedAny) return getMinMaxExpr(Kind, Ops); } // Okay, check to see if the same value occurs in the operand list twice. If // so, delete one. Since we sorted the list, these values are required to // be adjacent. llvm::CmpInst::Predicate GEPred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; llvm::CmpInst::Predicate LEPred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; llvm::CmpInst::Predicate FirstPred = IsMax ? GEPred : LEPred; llvm::CmpInst::Predicate SecondPred = IsMax ? 
LEPred : GEPred; for (unsigned i = 0, e = Ops.size() - 1; i != e; ++i) { if (Ops[i] == Ops[i + 1] || isKnownViaNonRecursiveReasoning(FirstPred, Ops[i], Ops[i + 1])) { // X op Y op Y --> X op Y // X op Y --> X, if we know X, Y are ordered appropriately Ops.erase(Ops.begin() + i + 1, Ops.begin() + i + 2); --i; --e; } else if (isKnownViaNonRecursiveReasoning(SecondPred, Ops[i], Ops[i + 1])) { // X op Y --> Y, if we know X, Y are ordered appropriately Ops.erase(Ops.begin() + i, Ops.begin() + i + 1); --i; --e; } } if (Ops.size() == 1) return Ops[0]; assert(!Ops.empty() && "Reduced smax down to nothing!"); // Okay, it looks like we really DO need an expr. Check to see if we // already have one, otherwise create a new one. FoldingSetNodeID ID; ID.AddInteger(Kind); for (unsigned i = 0, e = Ops.size(); i != e; ++i) ID.AddPointer(Ops[i]); void *IP = nullptr; const SCEV *ExistingSCEV = UniqueSCEVs.FindNodeOrInsertPos(ID, IP); if (ExistingSCEV) return ExistingSCEV; const SCEV **O = SCEVAllocator.Allocate(Ops.size()); std::uninitialized_copy(Ops.begin(), Ops.end(), O); SCEV *S = new (SCEVAllocator) SCEVMinMaxExpr(ID.Intern(SCEVAllocator), Kind, O, Ops.size()); UniqueSCEVs.InsertNode(S, IP); registerUser(S, Ops); return S; } namespace { class SCEVSequentialMinMaxDeduplicatingVisitor final : public SCEVVisitor> { using RetVal = std::optional; using Base = SCEVVisitor; ScalarEvolution &SE; const SCEVTypes RootKind; // Must be a sequential min/max expression. const SCEVTypes NonSequentialRootKind; // Non-sequential variant of RootKind. SmallPtrSet SeenOps; bool canRecurseInto(SCEVTypes Kind) const { // We can only recurse into the SCEV expression of the same effective type // as the type of our root SCEV expression. return RootKind == Kind || NonSequentialRootKind == Kind; }; RetVal visitAnyMinMaxExpr(const SCEV *S) { assert((isa(S) || isa(S)) && "Only for min/max expressions."); SCEVTypes Kind = S->getSCEVType(); if (!canRecurseInto(Kind)) return S; auto *NAry = cast(S); SmallVector NewOps; bool Changed = visit(Kind, NAry->operands(), NewOps); if (!Changed) return S; if (NewOps.empty()) return std::nullopt; return isa(S) ? SE.getSequentialMinMaxExpr(Kind, NewOps) : SE.getMinMaxExpr(Kind, NewOps); } RetVal visit(const SCEV *S) { // Has the whole operand been seen already? 
if (!SeenOps.insert(S).second) return std::nullopt; return Base::visit(S); } public: SCEVSequentialMinMaxDeduplicatingVisitor(ScalarEvolution &SE, SCEVTypes RootKind) : SE(SE), RootKind(RootKind), NonSequentialRootKind( SCEVSequentialMinMaxExpr::getEquivalentNonSequentialSCEVType( RootKind)) {} bool /*Changed*/ visit(SCEVTypes Kind, ArrayRef OrigOps, SmallVectorImpl &NewOps) { bool Changed = false; SmallVector Ops; Ops.reserve(OrigOps.size()); for (const SCEV *Op : OrigOps) { RetVal NewOp = visit(Op); if (NewOp != Op) Changed = true; if (NewOp) Ops.emplace_back(*NewOp); } if (Changed) NewOps = std::move(Ops); return Changed; } RetVal visitConstant(const SCEVConstant *Constant) { return Constant; } RetVal visitVScale(const SCEVVScale *VScale) { return VScale; } RetVal visitPtrToIntExpr(const SCEVPtrToIntExpr *Expr) { return Expr; } RetVal visitTruncateExpr(const SCEVTruncateExpr *Expr) { return Expr; } RetVal visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) { return Expr; } RetVal visitSignExtendExpr(const SCEVSignExtendExpr *Expr) { return Expr; } RetVal visitAddExpr(const SCEVAddExpr *Expr) { return Expr; } RetVal visitMulExpr(const SCEVMulExpr *Expr) { return Expr; } RetVal visitUDivExpr(const SCEVUDivExpr *Expr) { return Expr; } RetVal visitAddRecExpr(const SCEVAddRecExpr *Expr) { return Expr; } RetVal visitSMaxExpr(const SCEVSMaxExpr *Expr) { return visitAnyMinMaxExpr(Expr); } RetVal visitUMaxExpr(const SCEVUMaxExpr *Expr) { return visitAnyMinMaxExpr(Expr); } RetVal visitSMinExpr(const SCEVSMinExpr *Expr) { return visitAnyMinMaxExpr(Expr); } RetVal visitUMinExpr(const SCEVUMinExpr *Expr) { return visitAnyMinMaxExpr(Expr); } RetVal visitSequentialUMinExpr(const SCEVSequentialUMinExpr *Expr) { return visitAnyMinMaxExpr(Expr); } RetVal visitUnknown(const SCEVUnknown *Expr) { return Expr; } RetVal visitCouldNotCompute(const SCEVCouldNotCompute *Expr) { return Expr; } }; } // namespace static bool scevUnconditionallyPropagatesPoisonFromOperands(SCEVTypes Kind) { switch (Kind) { case scConstant: case scVScale: case scTruncate: case scZeroExtend: case scSignExtend: case scPtrToInt: case scAddExpr: case scMulExpr: case scUDivExpr: case scAddRecExpr: case scUMaxExpr: case scSMaxExpr: case scUMinExpr: case scSMinExpr: case scUnknown: // If any operand is poison, the whole expression is poison. return true; case scSequentialUMinExpr: // FIXME: if the *first* operand is poison, the whole expression is poison. return false; // Pessimistically, say that it does not propagate poison. case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); } llvm_unreachable("Unknown SCEV kind!"); } /// Return true if V is poison given that AssumedPoison is already poison. static bool impliesPoison(const SCEV *AssumedPoison, const SCEV *S) { // The only way poison may be introduced in a SCEV expression is from a // poison SCEVUnknown (ConstantExprs are also represented as SCEVUnknown, // not SCEVConstant). Notably, nowrap flags in SCEV nodes can *not* // introduce poison -- they encode guaranteed, non-speculated knowledge. // // Additionally, all SCEV nodes propagate poison from inputs to outputs, // with the notable exception of umin_seq, where only poison from the first // operand is (unconditionally) propagated. 
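  // For example, (0 umin_seq poison) is 0, because evaluation stops at the
  // saturating first operand and the poison second operand is never reached,
  // whereas (poison umin_seq 0) is poison.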
struct SCEVPoisonCollector { bool LookThroughMaybePoisonBlocking; SmallPtrSet MaybePoison; SCEVPoisonCollector(bool LookThroughMaybePoisonBlocking) : LookThroughMaybePoisonBlocking(LookThroughMaybePoisonBlocking) {} bool follow(const SCEV *S) { if (!LookThroughMaybePoisonBlocking && !scevUnconditionallyPropagatesPoisonFromOperands(S->getSCEVType())) return false; if (auto *SU = dyn_cast(S)) { if (!isGuaranteedNotToBePoison(SU->getValue())) MaybePoison.insert(S); } return true; } bool isDone() const { return false; } }; // First collect all SCEVs that might result in AssumedPoison to be poison. // We need to look through potentially poison-blocking operations here, // because we want to find all SCEVs that *might* result in poison, not only // those that are *required* to. SCEVPoisonCollector PC1(/* LookThroughMaybePoisonBlocking */ true); visitAll(AssumedPoison, PC1); // AssumedPoison is never poison. As the assumption is false, the implication // is true. Don't bother walking the other SCEV in this case. if (PC1.MaybePoison.empty()) return true; // Collect all SCEVs in S that, if poison, *will* result in S being poison // as well. We cannot look through potentially poison-blocking operations // here, as their arguments only *may* make the result poison. SCEVPoisonCollector PC2(/* LookThroughMaybePoisonBlocking */ false); visitAll(S, PC2); // Make sure that no matter which SCEV in PC1.MaybePoison is actually poison, // it will also make S poison by being part of PC2.MaybePoison. return all_of(PC1.MaybePoison, [&](const SCEV *S) { return PC2.MaybePoison.contains(S); }); } const SCEV * ScalarEvolution::getSequentialMinMaxExpr(SCEVTypes Kind, SmallVectorImpl &Ops) { assert(SCEVSequentialMinMaxExpr::isSequentialMinMaxType(Kind) && "Not a SCEVSequentialMinMaxExpr!"); assert(!Ops.empty() && "Cannot get empty (u|s)(min|max)!"); if (Ops.size() == 1) return Ops[0]; #ifndef NDEBUG Type *ETy = getEffectiveSCEVType(Ops[0]->getType()); for (unsigned i = 1, e = Ops.size(); i != e; ++i) { assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy && "Operand types don't match!"); assert(Ops[0]->getType()->isPointerTy() == Ops[i]->getType()->isPointerTy() && "min/max should be consistently pointerish"); } #endif // Note that SCEVSequentialMinMaxExpr is *NOT* commutative, // so we can *NOT* do any kind of sorting of the expressions! // Check if we have created the same expression before. if (const SCEV *S = findExistingSCEVInCache(Kind, Ops)) return S; // FIXME: there are *some* simplifications that we can do here. // Keep only the first instance of an operand. { SCEVSequentialMinMaxDeduplicatingVisitor Deduplicator(*this, Kind); bool Changed = Deduplicator.visit(Kind, Ops, Ops); if (Changed) return getSequentialMinMaxExpr(Kind, Ops); } // Check to see if one of the operands is of the same kind. If so, expand its // operands onto our operand list, and recurse to simplify. 
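  // For example, (%a umin_seq (%b umin_seq %c)) flattens to
  // (%a umin_seq %b umin_seq %c); the left-to-right evaluation order of the
  // operands is preserved, which is why no sorting is performed here.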
{ unsigned Idx = 0; bool DeletedAny = false; while (Idx < Ops.size()) { if (Ops[Idx]->getSCEVType() != Kind) { ++Idx; continue; } const auto *SMME = cast(Ops[Idx]); Ops.erase(Ops.begin() + Idx); Ops.insert(Ops.begin() + Idx, SMME->operands().begin(), SMME->operands().end()); DeletedAny = true; } if (DeletedAny) return getSequentialMinMaxExpr(Kind, Ops); } const SCEV *SaturationPoint; ICmpInst::Predicate Pred; switch (Kind) { case scSequentialUMinExpr: SaturationPoint = getZero(Ops[0]->getType()); Pred = ICmpInst::ICMP_ULE; break; default: llvm_unreachable("Not a sequential min/max type."); } for (unsigned i = 1, e = Ops.size(); i != e; ++i) { // We can replace %x umin_seq %y with %x umin %y if either: // * %y being poison implies %x is also poison. // * %x cannot be the saturating value (e.g. zero for umin). if (::impliesPoison(Ops[i], Ops[i - 1]) || isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_NE, Ops[i - 1], SaturationPoint)) { SmallVector SeqOps = {Ops[i - 1], Ops[i]}; Ops[i - 1] = getMinMaxExpr( SCEVSequentialMinMaxExpr::getEquivalentNonSequentialSCEVType(Kind), SeqOps); Ops.erase(Ops.begin() + i); return getSequentialMinMaxExpr(Kind, Ops); } // Fold %x umin_seq %y to %x if %x ule %y. // TODO: We might be able to prove the predicate for a later operand. if (isKnownViaNonRecursiveReasoning(Pred, Ops[i - 1], Ops[i])) { Ops.erase(Ops.begin() + i); return getSequentialMinMaxExpr(Kind, Ops); } } // Okay, it looks like we really DO need an expr. Check to see if we // already have one, otherwise create a new one. FoldingSetNodeID ID; ID.AddInteger(Kind); for (unsigned i = 0, e = Ops.size(); i != e; ++i) ID.AddPointer(Ops[i]); void *IP = nullptr; const SCEV *ExistingSCEV = UniqueSCEVs.FindNodeOrInsertPos(ID, IP); if (ExistingSCEV) return ExistingSCEV; const SCEV **O = SCEVAllocator.Allocate(Ops.size()); std::uninitialized_copy(Ops.begin(), Ops.end(), O); SCEV *S = new (SCEVAllocator) SCEVSequentialMinMaxExpr(ID.Intern(SCEVAllocator), Kind, O, Ops.size()); UniqueSCEVs.InsertNode(S, IP); registerUser(S, Ops); return S; } const SCEV *ScalarEvolution::getSMaxExpr(const SCEV *LHS, const SCEV *RHS) { SmallVector Ops = {LHS, RHS}; return getSMaxExpr(Ops); } const SCEV *ScalarEvolution::getSMaxExpr(SmallVectorImpl &Ops) { return getMinMaxExpr(scSMaxExpr, Ops); } const SCEV *ScalarEvolution::getUMaxExpr(const SCEV *LHS, const SCEV *RHS) { SmallVector Ops = {LHS, RHS}; return getUMaxExpr(Ops); } const SCEV *ScalarEvolution::getUMaxExpr(SmallVectorImpl &Ops) { return getMinMaxExpr(scUMaxExpr, Ops); } const SCEV *ScalarEvolution::getSMinExpr(const SCEV *LHS, const SCEV *RHS) { SmallVector Ops = { LHS, RHS }; return getSMinExpr(Ops); } const SCEV *ScalarEvolution::getSMinExpr(SmallVectorImpl &Ops) { return getMinMaxExpr(scSMinExpr, Ops); } const SCEV *ScalarEvolution::getUMinExpr(const SCEV *LHS, const SCEV *RHS, bool Sequential) { SmallVector Ops = { LHS, RHS }; return getUMinExpr(Ops, Sequential); } const SCEV *ScalarEvolution::getUMinExpr(SmallVectorImpl &Ops, bool Sequential) { return Sequential ? 
getSequentialMinMaxExpr(scSequentialUMinExpr, Ops) : getMinMaxExpr(scUMinExpr, Ops); } const SCEV * ScalarEvolution::getSizeOfExpr(Type *IntTy, TypeSize Size) { const SCEV *Res = getConstant(IntTy, Size.getKnownMinValue()); if (Size.isScalable()) Res = getMulExpr(Res, getVScale(IntTy)); return Res; } const SCEV *ScalarEvolution::getSizeOfExpr(Type *IntTy, Type *AllocTy) { return getSizeOfExpr(IntTy, getDataLayout().getTypeAllocSize(AllocTy)); } const SCEV *ScalarEvolution::getStoreSizeOfExpr(Type *IntTy, Type *StoreTy) { return getSizeOfExpr(IntTy, getDataLayout().getTypeStoreSize(StoreTy)); } const SCEV *ScalarEvolution::getOffsetOfExpr(Type *IntTy, StructType *STy, unsigned FieldNo) { // We can bypass creating a target-independent constant expression and then // folding it back into a ConstantInt. This is just a compile-time // optimization. const StructLayout *SL = getDataLayout().getStructLayout(STy); assert(!SL->getSizeInBits().isScalable() && "Cannot get offset for structure containing scalable vector types"); return getConstant(IntTy, SL->getElementOffset(FieldNo)); } const SCEV *ScalarEvolution::getUnknown(Value *V) { // Don't attempt to do anything other than create a SCEVUnknown object // here. createSCEV only calls getUnknown after checking for all other // interesting possibilities, and any other code that calls getUnknown // is doing so in order to hide a value from SCEV canonicalization. FoldingSetNodeID ID; ID.AddInteger(scUnknown); ID.AddPointer(V); void *IP = nullptr; if (SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) { assert(cast(S)->getValue() == V && "Stale SCEVUnknown in uniquing map!"); return S; } SCEV *S = new (SCEVAllocator) SCEVUnknown(ID.Intern(SCEVAllocator), V, this, FirstUnknown); FirstUnknown = cast(S); UniqueSCEVs.InsertNode(S, IP); return S; } //===----------------------------------------------------------------------===// // Basic SCEV Analysis and PHI Idiom Recognition Code // /// Test if values of the given type are analyzable within the SCEV /// framework. This primarily includes integer types, and it can optionally /// include pointer types if the ScalarEvolution class has access to /// target-specific information. bool ScalarEvolution::isSCEVable(Type *Ty) const { // Integers and pointers are always SCEVable. return Ty->isIntOrPtrTy(); } /// Return the size in bits of the specified type, for which isSCEVable must /// return true. uint64_t ScalarEvolution::getTypeSizeInBits(Type *Ty) const { assert(isSCEVable(Ty) && "Type is not SCEVable!"); if (Ty->isPointerTy()) return getDataLayout().getIndexTypeSizeInBits(Ty); return getDataLayout().getTypeSizeInBits(Ty); } /// Return a type with the same bitwidth as the given type and which represents /// how SCEV will treat the given type, for which isSCEVable must return /// true. For pointer types, this is the pointer index sized integer type. Type *ScalarEvolution::getEffectiveSCEVType(Type *Ty) const { assert(isSCEVable(Ty) && "Type is not SCEVable!"); if (Ty->isIntegerTy()) return Ty; // The only other support type is pointer. assert(Ty->isPointerTy() && "Unexpected non-pointer non-integer type!"); return getDataLayout().getIndexType(Ty); } Type *ScalarEvolution::getWiderType(Type *T1, Type *T2) const { return getTypeSizeInBits(T1) >= getTypeSizeInBits(T2) ? T1 : T2; } bool ScalarEvolution::instructionCouldExistWitthOperands(const SCEV *A, const SCEV *B) { /// For a valid use point to exist, the defining scope of one operand /// must dominate the other. 
  bool PreciseA, PreciseB;
  auto *ScopeA = getDefiningScopeBound({A}, PreciseA);
  auto *ScopeB = getDefiningScopeBound({B}, PreciseB);
  if (!PreciseA || !PreciseB)
    // Can't tell.
    return false;
  return (ScopeA == ScopeB) || DT.dominates(ScopeA, ScopeB) ||
         DT.dominates(ScopeB, ScopeA);
}

const SCEV *ScalarEvolution::getCouldNotCompute() {
  return CouldNotCompute.get();
}

bool ScalarEvolution::checkValidity(const SCEV *S) const {
  bool ContainsNulls = SCEVExprContains(S, [](const SCEV *S) {
    auto *SU = dyn_cast<SCEVUnknown>(S);
    return SU && SU->getValue() == nullptr;
  });

  return !ContainsNulls;
}

bool ScalarEvolution::containsAddRecurrence(const SCEV *S) {
  HasRecMapType::iterator I = HasRecMap.find(S);
  if (I != HasRecMap.end())
    return I->second;

  bool FoundAddRec =
      SCEVExprContains(S, [](const SCEV *S) { return isa<SCEVAddRecExpr>(S); });
  HasRecMap.insert({S, FoundAddRec});
  return FoundAddRec;
}

/// Return the ValueOffsetPair set for \p S. \p S can be represented
/// by the value and offset from any ValueOffsetPair in the set.
ArrayRef<Value *> ScalarEvolution::getSCEVValues(const SCEV *S) {
  ExprValueMapType::iterator SI = ExprValueMap.find_as(S);
  if (SI == ExprValueMap.end())
    return std::nullopt;
  return SI->second.getArrayRef();
}

/// Erase Value from ValueExprMap and ExprValueMap. ValueExprMap.erase(V)
/// cannot be used separately. eraseValueFromMap should be used to remove
/// V from ValueExprMap and ExprValueMap at the same time.
void ScalarEvolution::eraseValueFromMap(Value *V) {
  ValueExprMapType::iterator I = ValueExprMap.find_as(V);
  if (I != ValueExprMap.end()) {
    auto EVIt = ExprValueMap.find(I->second);
    bool Removed = EVIt->second.remove(V);
    (void) Removed;
    assert(Removed && "Value not in ExprValueMap?");
    ValueExprMap.erase(I);
  }
}

void ScalarEvolution::insertValueToMap(Value *V, const SCEV *S) {
  // A recursive query may have already computed the SCEV. It should be
  // equivalent, but may not necessarily be exactly the same, e.g. due to
  // lazily inferred nowrap flags.
  auto It = ValueExprMap.find_as(V);
  if (It == ValueExprMap.end()) {
    ValueExprMap.insert({SCEVCallbackVH(V, this), S});
    ExprValueMap[S].insert(V);
  }
}

/// Determine whether this instruction is either not SCEVable or will always
/// produce a SCEVUnknown. We do not have to walk past such instructions when
/// invalidating.
static bool isAlwaysUnknown(const Instruction *I) {
  switch (I->getOpcode()) {
  case Instruction::Load:
    return true;
  default:
    return false;
  }
}

/// Return an existing SCEV if it exists, otherwise analyze the expression and
/// create a new one.
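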
const SCEV *ScalarEvolution::getSCEV(Value *V) { assert(isSCEVable(V->getType()) && "Value is not SCEVable!"); if (const SCEV *S = getExistingSCEV(V)) return S; const SCEV *S = createSCEVIter(V); assert((!isa(V) || !isAlwaysUnknown(cast(V)) || isa(S)) && "isAlwaysUnknown() instruction is not SCEVUnknown"); return S; } const SCEV *ScalarEvolution::getExistingSCEV(Value *V) { assert(isSCEVable(V->getType()) && "Value is not SCEVable!"); ValueExprMapType::iterator I = ValueExprMap.find_as(V); if (I != ValueExprMap.end()) { const SCEV *S = I->second; assert(checkValidity(S) && "existing SCEV has not been properly invalidated"); return S; } return nullptr; } /// Return a SCEV corresponding to -V = -1*V const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags) { if (const SCEVConstant *VC = dyn_cast(V)) return getConstant( cast(ConstantExpr::getNeg(VC->getValue()))); Type *Ty = V->getType(); Ty = getEffectiveSCEVType(Ty); return getMulExpr(V, getMinusOne(Ty), Flags); } /// If Expr computes ~A, return A else return nullptr static const SCEV *MatchNotExpr(const SCEV *Expr) { const SCEVAddExpr *Add = dyn_cast(Expr); if (!Add || Add->getNumOperands() != 2 || !Add->getOperand(0)->isAllOnesValue()) return nullptr; const SCEVMulExpr *AddRHS = dyn_cast(Add->getOperand(1)); if (!AddRHS || AddRHS->getNumOperands() != 2 || !AddRHS->getOperand(0)->isAllOnesValue()) return nullptr; return AddRHS->getOperand(1); } /// Return a SCEV corresponding to ~V = -1-V const SCEV *ScalarEvolution::getNotSCEV(const SCEV *V) { assert(!V->getType()->isPointerTy() && "Can't negate pointer"); if (const SCEVConstant *VC = dyn_cast(V)) return getConstant( cast(ConstantExpr::getNot(VC->getValue()))); // Fold ~(u|s)(min|max)(~x, ~y) to (u|s)(max|min)(x, y) if (const SCEVMinMaxExpr *MME = dyn_cast(V)) { auto MatchMinMaxNegation = [&](const SCEVMinMaxExpr *MME) { SmallVector MatchedOperands; for (const SCEV *Operand : MME->operands()) { const SCEV *Matched = MatchNotExpr(Operand); if (!Matched) return (const SCEV *)nullptr; MatchedOperands.push_back(Matched); } return getMinMaxExpr(SCEVMinMaxExpr::negate(MME->getSCEVType()), MatchedOperands); }; if (const SCEV *Replaced = MatchMinMaxNegation(MME)) return Replaced; } Type *Ty = V->getType(); Ty = getEffectiveSCEVType(Ty); return getMinusSCEV(getMinusOne(Ty), V); } const SCEV *ScalarEvolution::removePointerBase(const SCEV *P) { assert(P->getType()->isPointerTy()); if (auto *AddRec = dyn_cast(P)) { // The base of an AddRec is the first operand. SmallVector Ops{AddRec->operands()}; Ops[0] = removePointerBase(Ops[0]); // Don't try to transfer nowrap flags for now. We could in some cases // (for example, if pointer operand of the AddRec is a SCEVUnknown). return getAddRecExpr(Ops, AddRec->getLoop(), SCEV::FlagAnyWrap); } if (auto *Add = dyn_cast(P)) { // The base of an Add is the pointer operand. SmallVector Ops{Add->operands()}; const SCEV **PtrOp = nullptr; for (const SCEV *&AddOp : Ops) { if (AddOp->getType()->isPointerTy()) { assert(!PtrOp && "Cannot have multiple pointer ops"); PtrOp = &AddOp; } } *PtrOp = removePointerBase(*PtrOp); // Don't try to transfer nowrap flags for now. We could in some cases // (for example, if the pointer operand of the Add is a SCEVUnknown). return getAddExpr(Ops); } // Any other expression must be a pointer base. return getZero(P->getType()); } const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags, unsigned Depth) { // Fast path: X - X --> 0. 
if (LHS == RHS) return getZero(LHS->getType()); // If we subtract two pointers with different pointer bases, bail. // Eventually, we're going to add an assertion to getMulExpr that we // can't multiply by a pointer. if (RHS->getType()->isPointerTy()) { if (!LHS->getType()->isPointerTy() || getPointerBase(LHS) != getPointerBase(RHS)) return getCouldNotCompute(); LHS = removePointerBase(LHS); RHS = removePointerBase(RHS); } // We represent LHS - RHS as LHS + (-1)*RHS. This transformation // makes it so that we cannot make much use of NUW. auto AddFlags = SCEV::FlagAnyWrap; const bool RHSIsNotMinSigned = !getSignedRangeMin(RHS).isMinSignedValue(); if (hasFlags(Flags, SCEV::FlagNSW)) { // Let M be the minimum representable signed value. Then (-1)*RHS // signed-wraps if and only if RHS is M. That can happen even for // a NSW subtraction because e.g. (-1)*M signed-wraps even though // -1 - M does not. So to transfer NSW from LHS - RHS to LHS + // (-1)*RHS, we need to prove that RHS != M. // // If LHS is non-negative and we know that LHS - RHS does not // signed-wrap, then RHS cannot be M. So we can rule out signed-wrap // either by proving that RHS > M or that LHS >= 0. if (RHSIsNotMinSigned || isKnownNonNegative(LHS)) { AddFlags = SCEV::FlagNSW; } } // FIXME: Find a correct way to transfer NSW to (-1)*M when LHS - // RHS is NSW and LHS >= 0. // // The difficulty here is that the NSW flag may have been proven // relative to a loop that is to be found in a recurrence in LHS and // not in RHS. Applying NSW to (-1)*M may then let the NSW have a // larger scope than intended. auto NegFlags = RHSIsNotMinSigned ? SCEV::FlagNSW : SCEV::FlagAnyWrap; return getAddExpr(LHS, getNegativeSCEV(RHS, NegFlags), AddFlags, Depth); } const SCEV *ScalarEvolution::getTruncateOrZeroExtend(const SCEV *V, Type *Ty, unsigned Depth) { Type *SrcTy = V->getType(); assert(SrcTy->isIntOrPtrTy() && Ty->isIntOrPtrTy() && "Cannot truncate or zero extend with non-integer arguments!"); if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty)) return V; // No conversion if (getTypeSizeInBits(SrcTy) > getTypeSizeInBits(Ty)) return getTruncateExpr(V, Ty, Depth); return getZeroExtendExpr(V, Ty, Depth); } const SCEV *ScalarEvolution::getTruncateOrSignExtend(const SCEV *V, Type *Ty, unsigned Depth) { Type *SrcTy = V->getType(); assert(SrcTy->isIntOrPtrTy() && Ty->isIntOrPtrTy() && "Cannot truncate or zero extend with non-integer arguments!"); if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty)) return V; // No conversion if (getTypeSizeInBits(SrcTy) > getTypeSizeInBits(Ty)) return getTruncateExpr(V, Ty, Depth); return getSignExtendExpr(V, Ty, Depth); } const SCEV * ScalarEvolution::getNoopOrZeroExtend(const SCEV *V, Type *Ty) { Type *SrcTy = V->getType(); assert(SrcTy->isIntOrPtrTy() && Ty->isIntOrPtrTy() && "Cannot noop or zero extend with non-integer arguments!"); assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) && "getNoopOrZeroExtend cannot truncate!"); if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty)) return V; // No conversion return getZeroExtendExpr(V, Ty); } const SCEV * ScalarEvolution::getNoopOrSignExtend(const SCEV *V, Type *Ty) { Type *SrcTy = V->getType(); assert(SrcTy->isIntOrPtrTy() && Ty->isIntOrPtrTy() && "Cannot noop or sign extend with non-integer arguments!"); assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) && "getNoopOrSignExtend cannot truncate!"); if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty)) return V; // No conversion return getSignExtendExpr(V, Ty); } const SCEV * 
ScalarEvolution::getNoopOrAnyExtend(const SCEV *V, Type *Ty) { Type *SrcTy = V->getType(); assert(SrcTy->isIntOrPtrTy() && Ty->isIntOrPtrTy() && "Cannot noop or any extend with non-integer arguments!"); assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) && "getNoopOrAnyExtend cannot truncate!"); if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty)) return V; // No conversion return getAnyExtendExpr(V, Ty); } const SCEV * ScalarEvolution::getTruncateOrNoop(const SCEV *V, Type *Ty) { Type *SrcTy = V->getType(); assert(SrcTy->isIntOrPtrTy() && Ty->isIntOrPtrTy() && "Cannot truncate or noop with non-integer arguments!"); assert(getTypeSizeInBits(SrcTy) >= getTypeSizeInBits(Ty) && "getTruncateOrNoop cannot extend!"); if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty)) return V; // No conversion return getTruncateExpr(V, Ty); } const SCEV *ScalarEvolution::getUMaxFromMismatchedTypes(const SCEV *LHS, const SCEV *RHS) { const SCEV *PromotedLHS = LHS; const SCEV *PromotedRHS = RHS; if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(RHS->getType())) PromotedRHS = getZeroExtendExpr(RHS, LHS->getType()); else PromotedLHS = getNoopOrZeroExtend(LHS, RHS->getType()); return getUMaxExpr(PromotedLHS, PromotedRHS); } const SCEV *ScalarEvolution::getUMinFromMismatchedTypes(const SCEV *LHS, const SCEV *RHS, bool Sequential) { SmallVector Ops = { LHS, RHS }; return getUMinFromMismatchedTypes(Ops, Sequential); } const SCEV * ScalarEvolution::getUMinFromMismatchedTypes(SmallVectorImpl &Ops, bool Sequential) { assert(!Ops.empty() && "At least one operand must be!"); // Trivial case. if (Ops.size() == 1) return Ops[0]; // Find the max type first. Type *MaxType = nullptr; for (const auto *S : Ops) if (MaxType) MaxType = getWiderType(MaxType, S->getType()); else MaxType = S->getType(); assert(MaxType && "Failed to find maximum type!"); // Extend all ops to max type. SmallVector PromotedOps; for (const auto *S : Ops) PromotedOps.push_back(getNoopOrZeroExtend(S, MaxType)); // Generate umin. return getUMinExpr(PromotedOps, Sequential); } const SCEV *ScalarEvolution::getPointerBase(const SCEV *V) { // A pointer operand may evaluate to a nonpointer expression, such as null. if (!V->getType()->isPointerTy()) return V; while (true) { if (auto *AddRec = dyn_cast(V)) { V = AddRec->getStart(); } else if (auto *Add = dyn_cast(V)) { const SCEV *PtrOp = nullptr; for (const SCEV *AddOp : Add->operands()) { if (AddOp->getType()->isPointerTy()) { assert(!PtrOp && "Cannot have multiple pointer ops"); PtrOp = AddOp; } } assert(PtrOp && "Must have pointer op"); V = PtrOp; } else // Not something we can look further into. return V; } } /// Push users of the given Instruction onto the given Worklist. static void PushDefUseChildren(Instruction *I, SmallVectorImpl &Worklist, SmallPtrSetImpl &Visited) { // Push the def-use children onto the Worklist stack. for (User *U : I->users()) { auto *UserInsn = cast(U); if (isAlwaysUnknown(UserInsn)) continue; if (Visited.insert(UserInsn).second) Worklist.push_back(UserInsn); } } namespace { /// Takes SCEV S and Loop L. For each AddRec sub-expression, use its start /// expression in case its Loop is L. If it is not L then /// if IgnoreOtherLoops is true then use AddRec itself /// otherwise rewrite cannot be done. /// If SCEV contains non-invariant unknown SCEV rewrite cannot be done. 
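/// For example, assuming %a is invariant in L, rewriting %a + {%b,+,1}<L> at
/// the entry of L yields %a + %b; an AddRec over some other loop is kept as-is
/// when IgnoreOtherLoops is true and makes the rewrite fail
/// (SCEVCouldNotCompute) otherwise, as does any SCEVUnknown that is not
/// invariant in L.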
class SCEVInitRewriter : public SCEVRewriteVisitor { public: static const SCEV *rewrite(const SCEV *S, const Loop *L, ScalarEvolution &SE, bool IgnoreOtherLoops = true) { SCEVInitRewriter Rewriter(L, SE); const SCEV *Result = Rewriter.visit(S); if (Rewriter.hasSeenLoopVariantSCEVUnknown()) return SE.getCouldNotCompute(); return Rewriter.hasSeenOtherLoops() && !IgnoreOtherLoops ? SE.getCouldNotCompute() : Result; } const SCEV *visitUnknown(const SCEVUnknown *Expr) { if (!SE.isLoopInvariant(Expr, L)) SeenLoopVariantSCEVUnknown = true; return Expr; } const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { // Only re-write AddRecExprs for this loop. if (Expr->getLoop() == L) return Expr->getStart(); SeenOtherLoops = true; return Expr; } bool hasSeenLoopVariantSCEVUnknown() { return SeenLoopVariantSCEVUnknown; } bool hasSeenOtherLoops() { return SeenOtherLoops; } private: explicit SCEVInitRewriter(const Loop *L, ScalarEvolution &SE) : SCEVRewriteVisitor(SE), L(L) {} const Loop *L; bool SeenLoopVariantSCEVUnknown = false; bool SeenOtherLoops = false; }; /// Takes SCEV S and Loop L. For each AddRec sub-expression, use its post /// increment expression in case its Loop is L. If it is not L then /// use AddRec itself. /// If SCEV contains non-invariant unknown SCEV rewrite cannot be done. class SCEVPostIncRewriter : public SCEVRewriteVisitor { public: static const SCEV *rewrite(const SCEV *S, const Loop *L, ScalarEvolution &SE) { SCEVPostIncRewriter Rewriter(L, SE); const SCEV *Result = Rewriter.visit(S); return Rewriter.hasSeenLoopVariantSCEVUnknown() ? SE.getCouldNotCompute() : Result; } const SCEV *visitUnknown(const SCEVUnknown *Expr) { if (!SE.isLoopInvariant(Expr, L)) SeenLoopVariantSCEVUnknown = true; return Expr; } const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { // Only re-write AddRecExprs for this loop. if (Expr->getLoop() == L) return Expr->getPostIncExpr(SE); SeenOtherLoops = true; return Expr; } bool hasSeenLoopVariantSCEVUnknown() { return SeenLoopVariantSCEVUnknown; } bool hasSeenOtherLoops() { return SeenOtherLoops; } private: explicit SCEVPostIncRewriter(const Loop *L, ScalarEvolution &SE) : SCEVRewriteVisitor(SE), L(L) {} const Loop *L; bool SeenLoopVariantSCEVUnknown = false; bool SeenOtherLoops = false; }; /// This class evaluates the compare condition by matching it against the /// condition of loop latch. If there is a match we assume a true value /// for the condition while building SCEV nodes. class SCEVBackedgeConditionFolder : public SCEVRewriteVisitor { public: static const SCEV *rewrite(const SCEV *S, const Loop *L, ScalarEvolution &SE) { bool IsPosBECond = false; Value *BECond = nullptr; if (BasicBlock *Latch = L->getLoopLatch()) { BranchInst *BI = dyn_cast(Latch->getTerminator()); if (BI && BI->isConditional()) { assert(BI->getSuccessor(0) != BI->getSuccessor(1) && "Both outgoing branches should not target same header!"); BECond = BI->getCondition(); IsPosBECond = BI->getSuccessor(0) == L->getHeader(); } else { return S; } } SCEVBackedgeConditionFolder Rewriter(L, BECond, IsPosBECond, SE); return Rewriter.visit(S); } const SCEV *visitUnknown(const SCEVUnknown *Expr) { const SCEV *Result = Expr; bool InvariantF = SE.isLoopInvariant(Expr, L); if (!InvariantF) { Instruction *I = cast(Expr->getValue()); switch (I->getOpcode()) { case Instruction::Select: { SelectInst *SI = cast(I); std::optional Res = compareWithBackedgeCondition(SI->getCondition()); if (Res) { bool IsOne = cast(*Res)->getValue()->isOne(); Result = SE.getSCEV(IsOne ? 
SI->getTrueValue() : SI->getFalseValue()); } break; } default: { std::optional Res = compareWithBackedgeCondition(I); if (Res) Result = *Res; break; } } } return Result; } private: explicit SCEVBackedgeConditionFolder(const Loop *L, Value *BECond, bool IsPosBECond, ScalarEvolution &SE) : SCEVRewriteVisitor(SE), L(L), BackedgeCond(BECond), IsPositiveBECond(IsPosBECond) {} std::optional compareWithBackedgeCondition(Value *IC); const Loop *L; /// Loop back condition. Value *BackedgeCond = nullptr; /// Set to true if loop back is on positive branch condition. bool IsPositiveBECond; }; std::optional SCEVBackedgeConditionFolder::compareWithBackedgeCondition(Value *IC) { // If value matches the backedge condition for loop latch, // then return a constant evolution node based on loopback // branch taken. if (BackedgeCond == IC) return IsPositiveBECond ? SE.getOne(Type::getInt1Ty(SE.getContext())) : SE.getZero(Type::getInt1Ty(SE.getContext())); return std::nullopt; } class SCEVShiftRewriter : public SCEVRewriteVisitor { public: static const SCEV *rewrite(const SCEV *S, const Loop *L, ScalarEvolution &SE) { SCEVShiftRewriter Rewriter(L, SE); const SCEV *Result = Rewriter.visit(S); return Rewriter.isValid() ? Result : SE.getCouldNotCompute(); } const SCEV *visitUnknown(const SCEVUnknown *Expr) { // Only allow AddRecExprs for this loop. if (!SE.isLoopInvariant(Expr, L)) Valid = false; return Expr; } const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { if (Expr->getLoop() == L && Expr->isAffine()) return SE.getMinusSCEV(Expr, Expr->getStepRecurrence(SE)); Valid = false; return Expr; } bool isValid() { return Valid; } private: explicit SCEVShiftRewriter(const Loop *L, ScalarEvolution &SE) : SCEVRewriteVisitor(SE), L(L) {} const Loop *L; bool Valid = true; }; } // end anonymous namespace SCEV::NoWrapFlags ScalarEvolution::proveNoWrapViaConstantRanges(const SCEVAddRecExpr *AR) { if (!AR->isAffine()) return SCEV::FlagAnyWrap; using OBO = OverflowingBinaryOperator; SCEV::NoWrapFlags Result = SCEV::FlagAnyWrap; if (!AR->hasNoSelfWrap()) { const SCEV *BECount = getConstantMaxBackedgeTakenCount(AR->getLoop()); if (const SCEVConstant *BECountMax = dyn_cast(BECount)) { ConstantRange StepCR = getSignedRange(AR->getStepRecurrence(*this)); const APInt &BECountAP = BECountMax->getAPInt(); unsigned NoOverflowBitWidth = BECountAP.getActiveBits() + StepCR.getMinSignedBits(); if (NoOverflowBitWidth <= getTypeSizeInBits(AR->getType())) Result = ScalarEvolution::setFlags(Result, SCEV::FlagNW); } } if (!AR->hasNoSignedWrap()) { ConstantRange AddRecRange = getSignedRange(AR); ConstantRange IncRange = getSignedRange(AR->getStepRecurrence(*this)); auto NSWRegion = ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Add, IncRange, OBO::NoSignedWrap); if (NSWRegion.contains(AddRecRange)) Result = ScalarEvolution::setFlags(Result, SCEV::FlagNSW); } if (!AR->hasNoUnsignedWrap()) { ConstantRange AddRecRange = getUnsignedRange(AR); ConstantRange IncRange = getUnsignedRange(AR->getStepRecurrence(*this)); auto NUWRegion = ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Add, IncRange, OBO::NoUnsignedWrap); if (NUWRegion.contains(AddRecRange)) Result = ScalarEvolution::setFlags(Result, SCEV::FlagNUW); } return Result; } SCEV::NoWrapFlags ScalarEvolution::proveNoSignedWrapViaInduction(const SCEVAddRecExpr *AR) { SCEV::NoWrapFlags Result = AR->getNoWrapFlags(); if (AR->hasNoSignedWrap()) return Result; if (!AR->isAffine()) return Result; // This function can be expensive, only try to prove NSW once per AddRec. 
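// (Sketch of the caching below, assuming SignedWrapViaInductionTried is a
// pointer set keyed by the AddRec: insert() reports whether the key is new,
// so a repeated query for the same AddRec returns immediately with whatever
// no-wrap flags the AddRec already carries.)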
if (!SignedWrapViaInductionTried.insert(AR).second) return Result; const SCEV *Step = AR->getStepRecurrence(*this); const Loop *L = AR->getLoop(); // Check whether the backedge-taken count is SCEVCouldNotCompute. // Note that this serves two purposes: It filters out loops that are // simply not analyzable, and it covers the case where this code is // being called from within backedge-taken count analysis, such that // attempting to ask for the backedge-taken count would likely result // in infinite recursion. In the later case, the analysis code will // cope with a conservative value, and it will take care to purge // that value once it has finished. const SCEV *MaxBECount = getConstantMaxBackedgeTakenCount(L); // Normally, in the cases we can prove no-overflow via a // backedge guarding condition, we can also compute a backedge // taken count for the loop. The exceptions are assumptions and // guards present in the loop -- SCEV is not great at exploiting // these to compute max backedge taken counts, but can still use // these to prove lack of overflow. Use this fact to avoid // doing extra work that may not pay off. if (isa(MaxBECount) && !HasGuards && AC.assumptions().empty()) return Result; // If the backedge is guarded by a comparison with the pre-inc value the // addrec is safe. Also, if the entry is guarded by a comparison with the // start value and the backedge is guarded by a comparison with the post-inc // value, the addrec is safe. ICmpInst::Predicate Pred; const SCEV *OverflowLimit = getSignedOverflowLimitForStep(Step, &Pred, this); if (OverflowLimit && (isLoopBackedgeGuardedByCond(L, Pred, AR, OverflowLimit) || isKnownOnEveryIteration(Pred, AR, OverflowLimit))) { Result = setFlags(Result, SCEV::FlagNSW); } return Result; } SCEV::NoWrapFlags ScalarEvolution::proveNoUnsignedWrapViaInduction(const SCEVAddRecExpr *AR) { SCEV::NoWrapFlags Result = AR->getNoWrapFlags(); if (AR->hasNoUnsignedWrap()) return Result; if (!AR->isAffine()) return Result; // This function can be expensive, only try to prove NUW once per AddRec. if (!UnsignedWrapViaInductionTried.insert(AR).second) return Result; const SCEV *Step = AR->getStepRecurrence(*this); unsigned BitWidth = getTypeSizeInBits(AR->getType()); const Loop *L = AR->getLoop(); // Check whether the backedge-taken count is SCEVCouldNotCompute. // Note that this serves two purposes: It filters out loops that are // simply not analyzable, and it covers the case where this code is // being called from within backedge-taken count analysis, such that // attempting to ask for the backedge-taken count would likely result // in infinite recursion. In the later case, the analysis code will // cope with a conservative value, and it will take care to purge // that value once it has finished. const SCEV *MaxBECount = getConstantMaxBackedgeTakenCount(L); // Normally, in the cases we can prove no-overflow via a // backedge guarding condition, we can also compute a backedge // taken count for the loop. The exceptions are assumptions and // guards present in the loop -- SCEV is not great at exploiting // these to compute max backedge taken counts, but can still use // these to prove lack of overflow. Use this fact to avoid // doing extra work that may not pay off. if (isa(MaxBECount) && !HasGuards && AC.assumptions().empty()) return Result; // If the backedge is guarded by a comparison with the pre-inc value the // addrec is safe. 
Also, if the entry is guarded by a comparison with the // start value and the backedge is guarded by a comparison with the post-inc // value, the addrec is safe. if (isKnownPositive(Step)) { const SCEV *N = getConstant(APInt::getMinValue(BitWidth) - getUnsignedRangeMax(Step)); if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT, AR, N) || isKnownOnEveryIteration(ICmpInst::ICMP_ULT, AR, N)) { Result = setFlags(Result, SCEV::FlagNUW); } } return Result; } namespace { /// Represents an abstract binary operation. This may exist as a /// normal instruction or constant expression, or may have been /// derived from an expression tree. struct BinaryOp { unsigned Opcode; Value *LHS; Value *RHS; bool IsNSW = false; bool IsNUW = false; /// Op is set if this BinaryOp corresponds to a concrete LLVM instruction or /// constant expression. Operator *Op = nullptr; explicit BinaryOp(Operator *Op) : Opcode(Op->getOpcode()), LHS(Op->getOperand(0)), RHS(Op->getOperand(1)), Op(Op) { if (auto *OBO = dyn_cast(Op)) { IsNSW = OBO->hasNoSignedWrap(); IsNUW = OBO->hasNoUnsignedWrap(); } } explicit BinaryOp(unsigned Opcode, Value *LHS, Value *RHS, bool IsNSW = false, bool IsNUW = false) : Opcode(Opcode), LHS(LHS), RHS(RHS), IsNSW(IsNSW), IsNUW(IsNUW) {} }; } // end anonymous namespace /// Try to map \p V into a BinaryOp, and return \c std::nullopt on failure. static std::optional MatchBinaryOp(Value *V, const DataLayout &DL, AssumptionCache &AC, const DominatorTree &DT, const Instruction *CxtI) { auto *Op = dyn_cast(V); if (!Op) return std::nullopt; // Implementation detail: all the cleverness here should happen without // creating new SCEV expressions -- our caller knowns tricks to avoid creating // SCEV expressions when possible, and we should not break that. switch (Op->getOpcode()) { case Instruction::Add: case Instruction::Sub: case Instruction::Mul: case Instruction::UDiv: case Instruction::URem: case Instruction::And: case Instruction::AShr: case Instruction::Shl: return BinaryOp(Op); case Instruction::Or: { // LLVM loves to convert `add` of operands with no common bits // into an `or`. But SCEV really doesn't deal with `or` that well, // so try extra hard to recognize this `or` as an `add`. if (haveNoCommonBitsSet(Op->getOperand(0), Op->getOperand(1), DL, &AC, CxtI, &DT, /*UseInstrInfo=*/true)) return BinaryOp(Instruction::Add, Op->getOperand(0), Op->getOperand(1), /*IsNSW=*/true, /*IsNUW=*/true); return BinaryOp(Op); } case Instruction::Xor: if (auto *RHSC = dyn_cast(Op->getOperand(1))) // If the RHS of the xor is a signmask, then this is just an add. // Instcombine turns add of signmask into xor as a strength reduction step. if (RHSC->getValue().isSignMask()) return BinaryOp(Instruction::Add, Op->getOperand(0), Op->getOperand(1)); // Binary `xor` is a bit-wise `add`. if (V->getType()->isIntegerTy(1)) return BinaryOp(Instruction::Add, Op->getOperand(0), Op->getOperand(1)); return BinaryOp(Op); case Instruction::LShr: // Turn logical shift right of a constant into a unsigned divide. if (ConstantInt *SA = dyn_cast(Op->getOperand(1))) { uint32_t BitWidth = cast(Op->getType())->getBitWidth(); // If the shift count is not less than the bitwidth, the result of // the shift is undefined. Don't try to analyze it, because the // resolution chosen here may differ from the resolution chosen in // other parts of the compiler. 
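// Illustrative example (not from the original source): `lshr i32 %x, 3` is
// matched below as the binary op `udiv i32 %x, 8`, while `lshr i32 %x, 35`
// has a shift count >= the bit width, is left unconverted, and falls back to
// the plain BinaryOp for the shift.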
if (SA->getValue().ult(BitWidth)) { Constant *X = ConstantInt::get(SA->getContext(), APInt::getOneBitSet(BitWidth, SA->getZExtValue())); return BinaryOp(Instruction::UDiv, Op->getOperand(0), X); } } return BinaryOp(Op); case Instruction::ExtractValue: { auto *EVI = cast(Op); if (EVI->getNumIndices() != 1 || EVI->getIndices()[0] != 0) break; auto *WO = dyn_cast(EVI->getAggregateOperand()); if (!WO) break; Instruction::BinaryOps BinOp = WO->getBinaryOp(); bool Signed = WO->isSigned(); // TODO: Should add nuw/nsw flags for mul as well. if (BinOp == Instruction::Mul || !isOverflowIntrinsicNoWrap(WO, DT)) return BinaryOp(BinOp, WO->getLHS(), WO->getRHS()); // Now that we know that all uses of the arithmetic-result component of // CI are guarded by the overflow check, we can go ahead and pretend // that the arithmetic is non-overflowing. return BinaryOp(BinOp, WO->getLHS(), WO->getRHS(), /* IsNSW = */ Signed, /* IsNUW = */ !Signed); } default: break; } // Recognise intrinsic loop.decrement.reg, and as this has exactly the same // semantics as a Sub, return a binary sub expression. if (auto *II = dyn_cast(V)) if (II->getIntrinsicID() == Intrinsic::loop_decrement_reg) return BinaryOp(Instruction::Sub, II->getOperand(0), II->getOperand(1)); return std::nullopt; } /// Helper function to createAddRecFromPHIWithCasts. We have a phi /// node whose symbolic (unknown) SCEV is \p SymbolicPHI, which is updated via /// the loop backedge by a SCEVAddExpr, possibly also with a few casts on the /// way. This function checks if \p Op, an operand of this SCEVAddExpr, /// follows one of the following patterns: /// Op == (SExt ix (Trunc iy (%SymbolicPHI) to ix) to iy) /// Op == (ZExt ix (Trunc iy (%SymbolicPHI) to ix) to iy) /// If the SCEV expression of \p Op conforms with one of the expected patterns /// we return the type of the truncation operation, and indicate whether the /// truncated type should be treated as signed/unsigned by setting /// \p Signed to true/false, respectively. static Type *isSimpleCastedPHI(const SCEV *Op, const SCEVUnknown *SymbolicPHI, bool &Signed, ScalarEvolution &SE) { // The case where Op == SymbolicPHI (that is, with no type conversions on // the way) is handled by the regular add recurrence creating logic and // would have already been triggered in createAddRecForPHI. Reaching it here // means that createAddRecFromPHI had failed for this PHI before (e.g., // because one of the other operands of the SCEVAddExpr updating this PHI is // not invariant). // // Here we look for the case where Op = (ext(trunc(SymbolicPHI))), and in // this case predicates that allow us to prove that Op == SymbolicPHI will // be added. if (Op == SymbolicPHI) return nullptr; unsigned SourceBits = SE.getTypeSizeInBits(SymbolicPHI->getType()); unsigned NewBits = SE.getTypeSizeInBits(Op->getType()); if (SourceBits != NewBits) return nullptr; const SCEVSignExtendExpr *SExt = dyn_cast(Op); const SCEVZeroExtendExpr *ZExt = dyn_cast(Op); if (!SExt && !ZExt) return nullptr; const SCEVTruncateExpr *Trunc = SExt ? 
dyn_cast(SExt->getOperand()) : dyn_cast(ZExt->getOperand()); if (!Trunc) return nullptr; const SCEV *X = Trunc->getOperand(); if (X != SymbolicPHI) return nullptr; Signed = SExt != nullptr; return Trunc->getType(); } static const Loop *isIntegerLoopHeaderPHI(const PHINode *PN, LoopInfo &LI) { if (!PN->getType()->isIntegerTy()) return nullptr; const Loop *L = LI.getLoopFor(PN->getParent()); if (!L || L->getHeader() != PN->getParent()) return nullptr; return L; } // Analyze \p SymbolicPHI, a SCEV expression of a phi node, and check if the // computation that updates the phi follows the following pattern: // (SExt/ZExt ix (Trunc iy (%SymbolicPHI) to ix) to iy) + InvariantAccum // which correspond to a phi->trunc->sext/zext->add->phi update chain. // If so, try to see if it can be rewritten as an AddRecExpr under some // Predicates. If successful, return them as a pair. Also cache the results // of the analysis. // // Example usage scenario: // Say the Rewriter is called for the following SCEV: // 8 * ((sext i32 (trunc i64 %X to i32) to i64) + %Step) // where: // %X = phi i64 (%Start, %BEValue) // It will visitMul->visitAdd->visitSExt->visitTrunc->visitUnknown(%X), // and call this function with %SymbolicPHI = %X. // // The analysis will find that the value coming around the backedge has // the following SCEV: // BEValue = ((sext i32 (trunc i64 %X to i32) to i64) + %Step) // Upon concluding that this matches the desired pattern, the function // will return the pair {NewAddRec, SmallPredsVec} where: // NewAddRec = {%Start,+,%Step} // SmallPredsVec = {P1, P2, P3} as follows: // P1(WrapPred): AR: {trunc(%Start),+,(trunc %Step)} Flags: // P2(EqualPred): %Start == (sext i32 (trunc i64 %Start to i32) to i64) // P3(EqualPred): %Step == (sext i32 (trunc i64 %Step to i32) to i64) // The returned pair means that SymbolicPHI can be rewritten into NewAddRec // under the predicates {P1,P2,P3}. // This predicated rewrite will be cached in PredicatedSCEVRewrites: // PredicatedSCEVRewrites[{%X,L}] = {NewAddRec, {P1,P2,P3)} // // TODO's: // // 1) Extend the Induction descriptor to also support inductions that involve // casts: When needed (namely, when we are called in the context of the // vectorizer induction analysis), a Set of cast instructions will be // populated by this method, and provided back to isInductionPHI. This is // needed to allow the vectorizer to properly record them to be ignored by // the cost model and to avoid vectorizing them (otherwise these casts, // which are redundant under the runtime overflow checks, will be // vectorized, which can be costly). // // 2) Support additional induction/PHISCEV patterns: We also want to support // inductions where the sext-trunc / zext-trunc operations (partly) occur // after the induction update operation (the induction increment): // // (Trunc iy (SExt/ZExt ix (%SymbolicPHI + InvariantAccum) to iy) to ix) // which correspond to a phi->add->trunc->sext/zext->phi update chain. // // (Trunc iy ((SExt/ZExt ix (%SymbolicPhi) to iy) + InvariantAccum) to ix) // which correspond to a phi->trunc->add->sext/zext->phi update chain. // // 3) Outline common code with createAddRecFromPHI to avoid duplication. std::optional>> ScalarEvolution::createAddRecFromPHIWithCastsImpl(const SCEVUnknown *SymbolicPHI) { SmallVector Predicates; // *** Part1: Analyze if we have a phi-with-cast pattern for which we can // return an AddRec expression under some predicate. 
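// Illustrative IR shape of the pattern being matched (invented names):
//   %iv      = phi i64 [ %start, %preheader ], [ %iv.next, %loop ]
//   %narrow  = trunc i64 %iv to i32
//   %wide    = sext i32 %narrow to i64      ; or zext
//   %iv.next = add i64 %wide, %step
// i.e. the phi -> trunc -> sext/zext -> add -> phi update chain described
// above.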
auto *PN = cast(SymbolicPHI->getValue()); const Loop *L = isIntegerLoopHeaderPHI(PN, LI); assert(L && "Expecting an integer loop header phi"); // The loop may have multiple entrances or multiple exits; we can analyze // this phi as an addrec if it has a unique entry value and a unique // backedge value. Value *BEValueV = nullptr, *StartValueV = nullptr; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *V = PN->getIncomingValue(i); if (L->contains(PN->getIncomingBlock(i))) { if (!BEValueV) { BEValueV = V; } else if (BEValueV != V) { BEValueV = nullptr; break; } } else if (!StartValueV) { StartValueV = V; } else if (StartValueV != V) { StartValueV = nullptr; break; } } if (!BEValueV || !StartValueV) return std::nullopt; const SCEV *BEValue = getSCEV(BEValueV); // If the value coming around the backedge is an add with the symbolic // value we just inserted, possibly with casts that we can ignore under // an appropriate runtime guard, then we found a simple induction variable! const auto *Add = dyn_cast(BEValue); if (!Add) return std::nullopt; // If there is a single occurrence of the symbolic value, possibly // casted, replace it with a recurrence. unsigned FoundIndex = Add->getNumOperands(); Type *TruncTy = nullptr; bool Signed; for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) if ((TruncTy = isSimpleCastedPHI(Add->getOperand(i), SymbolicPHI, Signed, *this))) if (FoundIndex == e) { FoundIndex = i; break; } if (FoundIndex == Add->getNumOperands()) return std::nullopt; // Create an add with everything but the specified operand. SmallVector Ops; for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) if (i != FoundIndex) Ops.push_back(Add->getOperand(i)); const SCEV *Accum = getAddExpr(Ops); // The runtime checks will not be valid if the step amount is // varying inside the loop. if (!isLoopInvariant(Accum, L)) return std::nullopt; // *** Part2: Create the predicates // Analysis was successful: we have a phi-with-cast pattern for which we // can return an AddRec expression under the following predicates: // // P1: A Wrap predicate that guarantees that Trunc(Start) + i*Trunc(Accum) // fits within the truncated type (does not overflow) for i = 0 to n-1. 
// P2: An Equal predicate that guarantees that // Start = (Ext ix (Trunc iy (Start) to ix) to iy) // P3: An Equal predicate that guarantees that // Accum = (Ext ix (Trunc iy (Accum) to ix) to iy) // // As we next prove, the above predicates guarantee that: // Start + i*Accum = (Ext ix (Trunc iy ( Start + i*Accum ) to ix) to iy) // // // More formally, we want to prove that: // Expr(i+1) = Start + (i+1) * Accum // = (Ext ix (Trunc iy (Expr(i)) to ix) to iy) + Accum // // Given that: // 1) Expr(0) = Start // 2) Expr(1) = Start + Accum // = (Ext ix (Trunc iy (Start) to ix) to iy) + Accum :: from P2 // 3) Induction hypothesis (step i): // Expr(i) = (Ext ix (Trunc iy (Expr(i-1)) to ix) to iy) + Accum // // Proof: // Expr(i+1) = // = Start + (i+1)*Accum // = (Start + i*Accum) + Accum // = Expr(i) + Accum // = (Ext ix (Trunc iy (Expr(i-1)) to ix) to iy) + Accum + Accum // :: from step i // // = (Ext ix (Trunc iy (Start + (i-1)*Accum) to ix) to iy) + Accum + Accum // // = (Ext ix (Trunc iy (Start + (i-1)*Accum) to ix) to iy) // + (Ext ix (Trunc iy (Accum) to ix) to iy) // + Accum :: from P3 // // = (Ext ix (Trunc iy ((Start + (i-1)*Accum) + Accum) to ix) to iy) // + Accum :: from P1: Ext(x)+Ext(y)=>Ext(x+y) // // = (Ext ix (Trunc iy (Start + i*Accum) to ix) to iy) + Accum // = (Ext ix (Trunc iy (Expr(i)) to ix) to iy) + Accum // // By induction, the same applies to all iterations 1<=i(PHISCEV)) { SCEVWrapPredicate::IncrementWrapFlags AddedFlags = Signed ? SCEVWrapPredicate::IncrementNSSW : SCEVWrapPredicate::IncrementNUSW; const SCEVPredicate *AddRecPred = getWrapPredicate(AR, AddedFlags); Predicates.push_back(AddRecPred); } // Create the Equal Predicates P2,P3: // It is possible that the predicates P2 and/or P3 are computable at // compile time due to StartVal and/or Accum being constants. // If either one is, then we can check that now and escape if either P2 // or P3 is false. // Construct the extended SCEV: (Ext ix (Trunc iy (Expr) to ix) to iy) // for each of StartVal and Accum auto getExtendedExpr = [&](const SCEV *Expr, bool CreateSignExtend) -> const SCEV * { assert(isLoopInvariant(Expr, L) && "Expr is expected to be invariant"); const SCEV *TruncatedExpr = getTruncateExpr(Expr, TruncTy); const SCEV *ExtendedExpr = CreateSignExtend ? 
getSignExtendExpr(TruncatedExpr, Expr->getType()) : getZeroExtendExpr(TruncatedExpr, Expr->getType()); return ExtendedExpr; }; // Given: // ExtendedExpr = (Ext ix (Trunc iy (Expr) to ix) to iy // = getExtendedExpr(Expr) // Determine whether the predicate P: Expr == ExtendedExpr // is known to be false at compile time auto PredIsKnownFalse = [&](const SCEV *Expr, const SCEV *ExtendedExpr) -> bool { return Expr != ExtendedExpr && isKnownPredicate(ICmpInst::ICMP_NE, Expr, ExtendedExpr); }; const SCEV *StartExtended = getExtendedExpr(StartVal, Signed); if (PredIsKnownFalse(StartVal, StartExtended)) { LLVM_DEBUG(dbgs() << "P2 is compile-time false\n";); return std::nullopt; } // The Step is always Signed (because the overflow checks are either // NSSW or NUSW) const SCEV *AccumExtended = getExtendedExpr(Accum, /*CreateSignExtend=*/true); if (PredIsKnownFalse(Accum, AccumExtended)) { LLVM_DEBUG(dbgs() << "P3 is compile-time false\n";); return std::nullopt; } auto AppendPredicate = [&](const SCEV *Expr, const SCEV *ExtendedExpr) -> void { if (Expr != ExtendedExpr && !isKnownPredicate(ICmpInst::ICMP_EQ, Expr, ExtendedExpr)) { const SCEVPredicate *Pred = getEqualPredicate(Expr, ExtendedExpr); LLVM_DEBUG(dbgs() << "Added Predicate: " << *Pred); Predicates.push_back(Pred); } }; AppendPredicate(StartVal, StartExtended); AppendPredicate(Accum, AccumExtended); // *** Part3: Predicates are ready. Now go ahead and create the new addrec in // which the casts had been folded away. The caller can rewrite SymbolicPHI // into NewAR if it will also add the runtime overflow checks specified in // Predicates. auto *NewAR = getAddRecExpr(StartVal, Accum, L, SCEV::FlagAnyWrap); std::pair> PredRewrite = std::make_pair(NewAR, Predicates); // Remember the result of the analysis for this SCEV at this location. PredicatedSCEVRewrites[{SymbolicPHI, L}] = PredRewrite; return PredRewrite; } std::optional>> ScalarEvolution::createAddRecFromPHIWithCasts(const SCEVUnknown *SymbolicPHI) { auto *PN = cast(SymbolicPHI->getValue()); const Loop *L = isIntegerLoopHeaderPHI(PN, LI); if (!L) return std::nullopt; // Check to see if we already analyzed this PHI. auto I = PredicatedSCEVRewrites.find({SymbolicPHI, L}); if (I != PredicatedSCEVRewrites.end()) { std::pair> Rewrite = I->second; // Analysis was done before and failed to create an AddRec: if (Rewrite.first == SymbolicPHI) return std::nullopt; // Analysis was done before and succeeded in creating an AddRec under // a predicate: assert(isa(Rewrite.first) && "Expected an AddRec"); assert(!(Rewrite.second).empty() && "Expected to find Predicates"); return Rewrite; } std::optional>> Rewrite = createAddRecFromPHIWithCastsImpl(SymbolicPHI); // Record in the cache that the analysis failed if (!Rewrite) { SmallVector Predicates; PredicatedSCEVRewrites[{SymbolicPHI, L}] = {SymbolicPHI, Predicates}; return std::nullopt; } return Rewrite; } // FIXME: This utility is currently required because the Rewriter currently // does not rewrite this expression: // {0, +, (sext ix (trunc iy to ix) to iy)} // into {0, +, %step}, // even when the following Equal predicate exists: // "%step == (sext ix (trunc iy to ix) to iy)".
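// Illustrative use of the comparison below (invented values): with the equal
// predicate "%step == (sext i8 (trunc i64 %step to i8) to i64)" available in
// Preds, the addrecs {0,+,%step} and
// {0,+,(sext i8 (trunc i64 %step to i8) to i64)} are treated as equal, since
// each corresponding start/step pair is either identical or implied equal by
// the predicates.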
bool PredicatedScalarEvolution::areAddRecsEqualWithPreds( const SCEVAddRecExpr *AR1, const SCEVAddRecExpr *AR2) const { if (AR1 == AR2) return true; auto areExprsEqual = [&](const SCEV *Expr1, const SCEV *Expr2) -> bool { if (Expr1 != Expr2 && !Preds->implies(SE.getEqualPredicate(Expr1, Expr2)) && !Preds->implies(SE.getEqualPredicate(Expr2, Expr1))) return false; return true; }; if (!areExprsEqual(AR1->getStart(), AR2->getStart()) || !areExprsEqual(AR1->getStepRecurrence(SE), AR2->getStepRecurrence(SE))) return false; return true; } /// A helper function for createAddRecFromPHI to handle simple cases. /// /// This function tries to find an AddRec expression for the simplest (yet most /// common) cases: PN = PHI(Start, OP(Self, LoopInvariant)). /// If it fails, createAddRecFromPHI will use a more general, but slow, /// technique for finding the AddRec expression. const SCEV *ScalarEvolution::createSimpleAffineAddRec(PHINode *PN, Value *BEValueV, Value *StartValueV) { const Loop *L = LI.getLoopFor(PN->getParent()); assert(L && L->getHeader() == PN->getParent()); assert(BEValueV && StartValueV); auto BO = MatchBinaryOp(BEValueV, getDataLayout(), AC, DT, PN); if (!BO) return nullptr; if (BO->Opcode != Instruction::Add) return nullptr; const SCEV *Accum = nullptr; if (BO->LHS == PN && L->isLoopInvariant(BO->RHS)) Accum = getSCEV(BO->RHS); else if (BO->RHS == PN && L->isLoopInvariant(BO->LHS)) Accum = getSCEV(BO->LHS); if (!Accum) return nullptr; SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap; if (BO->IsNUW) Flags = setFlags(Flags, SCEV::FlagNUW); if (BO->IsNSW) Flags = setFlags(Flags, SCEV::FlagNSW); const SCEV *StartVal = getSCEV(StartValueV); const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags); insertValueToMap(PN, PHISCEV); if (auto *AR = dyn_cast(PHISCEV)) { setNoWrapFlags(const_cast(AR), (SCEV::NoWrapFlags)(AR->getNoWrapFlags() | proveNoWrapViaConstantRanges(AR))); } // We can add Flags to the post-inc expression only if we // know that it is *undefined behavior* for BEValueV to // overflow. if (auto *BEInst = dyn_cast(BEValueV)) { assert(isLoopInvariant(Accum, L) && "Accum is defined outside L, but is not invariant?"); if (isAddRecNeverPoison(BEInst, L)) (void)getAddRecExpr(getAddExpr(StartVal, Accum), Accum, L, Flags); } return PHISCEV; } const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) { const Loop *L = LI.getLoopFor(PN->getParent()); if (!L || L->getHeader() != PN->getParent()) return nullptr; // The loop may have multiple entrances or multiple exits; we can analyze // this phi as an addrec if it has a unique entry value and a unique // backedge value. Value *BEValueV = nullptr, *StartValueV = nullptr; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *V = PN->getIncomingValue(i); if (L->contains(PN->getIncomingBlock(i))) { if (!BEValueV) { BEValueV = V; } else if (BEValueV != V) { BEValueV = nullptr; break; } } else if (!StartValueV) { StartValueV = V; } else if (StartValueV != V) { StartValueV = nullptr; break; } } if (!BEValueV || !StartValueV) return nullptr; assert(ValueExprMap.find_as(PN) == ValueExprMap.end() && "PHI node already processed?"); // First, try to find AddRec expression without creating a fictituos symbolic // value for PN. if (auto *S = createSimpleAffineAddRec(PN, BEValueV, StartValueV)) return S; // Handle PHI node value symbolically. const SCEV *SymbolicName = getUnknown(PN); insertValueToMap(PN, SymbolicName); // Using this symbolic name for the PHI, analyze the value coming around // the back-edge. 
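// Sketch of the symbolic-name technique (invented example): for
//   %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
//   %i.next = add i32 %i, 1
// the phi has just been mapped to an opaque SCEVUnknown, so the backedge
// value evaluates to (1 + %i); finding the single occurrence of that symbolic
// value below lets the phi be folded into the addrec {0,+,1}<%loop>.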
const SCEV *BEValue = getSCEV(BEValueV); // NOTE: If BEValue is loop invariant, we know that the PHI node just // has a special value for the first iteration of the loop. // If the value coming around the backedge is an add with the symbolic // value we just inserted, then we found a simple induction variable! if (const SCEVAddExpr *Add = dyn_cast(BEValue)) { // If there is a single occurrence of the symbolic value, replace it // with a recurrence. unsigned FoundIndex = Add->getNumOperands(); for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) if (Add->getOperand(i) == SymbolicName) if (FoundIndex == e) { FoundIndex = i; break; } if (FoundIndex != Add->getNumOperands()) { // Create an add with everything but the specified operand. SmallVector Ops; for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) if (i != FoundIndex) Ops.push_back(SCEVBackedgeConditionFolder::rewrite(Add->getOperand(i), L, *this)); const SCEV *Accum = getAddExpr(Ops); // This is not a valid addrec if the step amount is varying each // loop iteration, but is not itself an addrec in this loop. if (isLoopInvariant(Accum, L) || (isa(Accum) && cast(Accum)->getLoop() == L)) { SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap; if (auto BO = MatchBinaryOp(BEValueV, getDataLayout(), AC, DT, PN)) { if (BO->Opcode == Instruction::Add && BO->LHS == PN) { if (BO->IsNUW) Flags = setFlags(Flags, SCEV::FlagNUW); if (BO->IsNSW) Flags = setFlags(Flags, SCEV::FlagNSW); } } else if (GEPOperator *GEP = dyn_cast(BEValueV)) { // If the increment is an inbounds GEP, then we know the address // space cannot be wrapped around. We cannot make any guarantee // about signed or unsigned overflow because pointers are // unsigned but we may have a negative index from the base // pointer. We can guarantee that no unsigned wrap occurs if the // indices form a positive value. if (GEP->isInBounds() && GEP->getOperand(0) == PN) { Flags = setFlags(Flags, SCEV::FlagNW); if (isKnownPositive(Accum)) Flags = setFlags(Flags, SCEV::FlagNUW); } // We cannot transfer nuw and nsw flags from subtraction // operations -- sub nuw X, Y is not the same as add nuw X, -Y // for instance. } const SCEV *StartVal = getSCEV(StartValueV); const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags); // Okay, for the entire analysis of this edge we assumed the PHI // to be symbolic. We now need to go back and purge all of the // entries for the scalars that use the symbolic expression. forgetMemoizedResults(SymbolicName); insertValueToMap(PN, PHISCEV); if (auto *AR = dyn_cast(PHISCEV)) { setNoWrapFlags(const_cast(AR), (SCEV::NoWrapFlags)(AR->getNoWrapFlags() | proveNoWrapViaConstantRanges(AR))); } // We can add Flags to the post-inc expression only if we // know that it is *undefined behavior* for BEValueV to // overflow. if (auto *BEInst = dyn_cast(BEValueV)) if (isLoopInvariant(Accum, L) && isAddRecNeverPoison(BEInst, L)) (void)getAddRecExpr(getAddExpr(StartVal, Accum), Accum, L, Flags); return PHISCEV; } } } else { // Otherwise, this could be a loop like this: // i = 0; for (j = 1; ..; ++j) { .... i = j; } // In this case, j = {1,+,1} and BEValue is j. // Because the other in-value of i (0) fits the evolution of BEValue // i really is an addrec evolution. 
// // We can generalize this saying that i is the shifted value of BEValue // by one iteration: // PHI(f(0), f({1,+,1})) --> f({0,+,1}) const SCEV *Shifted = SCEVShiftRewriter::rewrite(BEValue, L, *this); const SCEV *Start = SCEVInitRewriter::rewrite(Shifted, L, *this, false); if (Shifted != getCouldNotCompute() && Start != getCouldNotCompute()) { const SCEV *StartVal = getSCEV(StartValueV); if (Start == StartVal) { // Okay, for the entire analysis of this edge we assumed the PHI // to be symbolic. We now need to go back and purge all of the // entries for the scalars that use the symbolic expression. forgetMemoizedResults(SymbolicName); insertValueToMap(PN, Shifted); return Shifted; } } } // Remove the temporary PHI node SCEV that has been inserted while intending // to create an AddRecExpr for this PHI node. We can not keep this temporary // as it will prevent later (possibly simpler) SCEV expressions to be added // to the ValueExprMap. eraseValueFromMap(PN); return nullptr; } // Try to match a control flow sequence that branches out at BI and merges back // at Merge into a "C ? LHS : RHS" select pattern. Return true on a successful // match. static bool BrPHIToSelect(DominatorTree &DT, BranchInst *BI, PHINode *Merge, Value *&C, Value *&LHS, Value *&RHS) { C = BI->getCondition(); BasicBlockEdge LeftEdge(BI->getParent(), BI->getSuccessor(0)); BasicBlockEdge RightEdge(BI->getParent(), BI->getSuccessor(1)); if (!LeftEdge.isSingleEdge()) return false; assert(RightEdge.isSingleEdge() && "Follows from LeftEdge.isSingleEdge()"); Use &LeftUse = Merge->getOperandUse(0); Use &RightUse = Merge->getOperandUse(1); if (DT.dominates(LeftEdge, LeftUse) && DT.dominates(RightEdge, RightUse)) { LHS = LeftUse; RHS = RightUse; return true; } if (DT.dominates(LeftEdge, RightUse) && DT.dominates(RightEdge, LeftUse)) { LHS = RightUse; RHS = LeftUse; return true; } return false; } const SCEV *ScalarEvolution::createNodeFromSelectLikePHI(PHINode *PN) { auto IsReachable = [&](BasicBlock *BB) { return DT.isReachableFromEntry(BB); }; if (PN->getNumIncomingValues() == 2 && all_of(PN->blocks(), IsReachable)) { // Try to match // // br %cond, label %left, label %right // left: // br label %merge // right: // br label %merge // merge: // V = phi [ %x, %left ], [ %y, %right ] // // as "select %cond, %x, %y" BasicBlock *IDom = DT[PN->getParent()]->getIDom()->getBlock(); assert(IDom && "At least the entry block should dominate PN"); auto *BI = dyn_cast(IDom->getTerminator()); Value *Cond = nullptr, *LHS = nullptr, *RHS = nullptr; if (BI && BI->isConditional() && BrPHIToSelect(DT, BI, PN, Cond, LHS, RHS) && properlyDominates(getSCEV(LHS), PN->getParent()) && properlyDominates(getSCEV(RHS), PN->getParent())) return createNodeForSelectOrPHI(PN, Cond, LHS, RHS); } return nullptr; } const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) { if (const SCEV *S = createAddRecFromPHI(PN)) return S; if (Value *V = simplifyInstruction(PN, {getDataLayout(), &TLI, &DT, &AC})) return getSCEV(V); if (const SCEV *S = createNodeFromSelectLikePHI(PN)) return S; // If it's not a loop phi, we can't handle it yet. return getUnknown(PN); } bool SCEVMinMaxExprContains(const SCEV *Root, const SCEV *OperandToFind, SCEVTypes RootKind) { struct FindClosure { const SCEV *OperandToFind; const SCEVTypes RootKind; // Must be a sequential min/max expression. const SCEVTypes NonSequentialRootKind; // Non-seq variant of RootKind. 
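// Illustrative example (invented operands): searching
//   umin_seq(%a, (zext (umin %b, %c)))
// for %c succeeds, because the walk may step through the root's own kind,
// its non-sequential variant, and zero-extends, but not through unrelated
// expressions such as adds.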
bool Found = false; bool canRecurseInto(SCEVTypes Kind) const { // We can only recurse into the SCEV expression of the same effective type // as the type of our root SCEV expression, and into zero-extensions. return RootKind == Kind || NonSequentialRootKind == Kind || scZeroExtend == Kind; }; FindClosure(const SCEV *OperandToFind, SCEVTypes RootKind) : OperandToFind(OperandToFind), RootKind(RootKind), NonSequentialRootKind( SCEVSequentialMinMaxExpr::getEquivalentNonSequentialSCEVType( RootKind)) {} bool follow(const SCEV *S) { Found = S == OperandToFind; return !isDone() && canRecurseInto(S->getSCEVType()); } bool isDone() const { return Found; } }; FindClosure FC(OperandToFind, RootKind); visitAll(Root, FC); return FC.Found; } std::optional ScalarEvolution::createNodeForSelectOrPHIInstWithICmpInstCond(Type *Ty, ICmpInst *Cond, Value *TrueVal, Value *FalseVal) { // Try to match some simple smax or umax patterns. auto *ICI = Cond; Value *LHS = ICI->getOperand(0); Value *RHS = ICI->getOperand(1); switch (ICI->getPredicate()) { case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: std::swap(LHS, RHS); [[fallthrough]]; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: // a > b ? a+x : b+x -> max(a, b)+x // a > b ? b+x : a+x -> min(a, b)+x if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(Ty)) { bool Signed = ICI->isSigned(); const SCEV *LA = getSCEV(TrueVal); const SCEV *RA = getSCEV(FalseVal); const SCEV *LS = getSCEV(LHS); const SCEV *RS = getSCEV(RHS); if (LA->getType()->isPointerTy()) { // FIXME: Handle cases where LS/RS are pointers not equal to LA/RA. // Need to make sure we can't produce weird expressions involving // negated pointers. if (LA == LS && RA == RS) return Signed ? getSMaxExpr(LS, RS) : getUMaxExpr(LS, RS); if (LA == RS && RA == LS) return Signed ? getSMinExpr(LS, RS) : getUMinExpr(LS, RS); } auto CoerceOperand = [&](const SCEV *Op) -> const SCEV * { if (Op->getType()->isPointerTy()) { Op = getLosslessPtrToIntExpr(Op); if (isa(Op)) return Op; } if (Signed) Op = getNoopOrSignExtend(Op, Ty); else Op = getNoopOrZeroExtend(Op, Ty); return Op; }; LS = CoerceOperand(LS); RS = CoerceOperand(RS); if (isa(LS) || isa(RS)) break; const SCEV *LDiff = getMinusSCEV(LA, LS); const SCEV *RDiff = getMinusSCEV(RA, RS); if (LDiff == RDiff) return getAddExpr(Signed ? getSMaxExpr(LS, RS) : getUMaxExpr(LS, RS), LDiff); LDiff = getMinusSCEV(LA, RS); RDiff = getMinusSCEV(RA, LS); if (LDiff == RDiff) return getAddExpr(Signed ? getSMinExpr(LS, RS) : getUMinExpr(LS, RS), LDiff); } break; case ICmpInst::ICMP_NE: // x != 0 ? x+y : C+y -> x == 0 ? C+y : x+y std::swap(TrueVal, FalseVal); [[fallthrough]]; case ICmpInst::ICMP_EQ: // x == 0 ? C+y : x+y -> umax(x, C)+y iff C u<= 1 if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(Ty) && isa(RHS) && cast(RHS)->isZero()) { const SCEV *X = getNoopOrZeroExtend(getSCEV(LHS), Ty); const SCEV *TrueValExpr = getSCEV(TrueVal); // C+y const SCEV *FalseValExpr = getSCEV(FalseVal); // x+y const SCEV *Y = getMinusSCEV(FalseValExpr, X); // y = (x+y)-x const SCEV *C = getMinusSCEV(TrueValExpr, Y); // C = (C+y)-y if (isa(C) && cast(C)->getAPInt().ule(1)) return getAddExpr(getUMaxExpr(X, C), Y); } // x == 0 ? 0 : umin (..., x, ...) -> umin_seq(x, umin (...)) // x == 0 ? 0 : umin_seq(..., x, ...) -> umin_seq(x, umin_seq(...)) // x == 0 ? 0 : umin (..., umin_seq(..., x, ...), ...) 
// -> umin_seq(x, umin (..., umin_seq(...), ...)) if (isa(RHS) && cast(RHS)->isZero() && isa(TrueVal) && cast(TrueVal)->isZero()) { const SCEV *X = getSCEV(LHS); while (auto *ZExt = dyn_cast(X)) X = ZExt->getOperand(); if (getTypeSizeInBits(X->getType()) <= getTypeSizeInBits(Ty)) { const SCEV *FalseValExpr = getSCEV(FalseVal); if (SCEVMinMaxExprContains(FalseValExpr, X, scSequentialUMinExpr)) return getUMinExpr(getNoopOrZeroExtend(X, Ty), FalseValExpr, /*Sequential=*/true); } } break; default: break; } return std::nullopt; } static std::optional createNodeForSelectViaUMinSeq(ScalarEvolution *SE, const SCEV *CondExpr, const SCEV *TrueExpr, const SCEV *FalseExpr) { assert(CondExpr->getType()->isIntegerTy(1) && TrueExpr->getType() == FalseExpr->getType() && TrueExpr->getType()->isIntegerTy(1) && "Unexpected operands of a select."); // i1 cond ? i1 x : i1 C --> C + (i1 cond ? (i1 x - i1 C) : i1 0) // --> C + (umin_seq cond, x - C) // // i1 cond ? i1 C : i1 x --> C + (i1 cond ? i1 0 : (i1 x - i1 C)) // --> C + (i1 ~cond ? (i1 x - i1 C) : i1 0) // --> C + (umin_seq ~cond, x - C) // FIXME: while we can't legally model the case where both of the hands // are fully variable, we only require that the *difference* is constant. if (!isa(TrueExpr) && !isa(FalseExpr)) return std::nullopt; const SCEV *X, *C; if (isa(TrueExpr)) { CondExpr = SE->getNotSCEV(CondExpr); X = FalseExpr; C = TrueExpr; } else { X = TrueExpr; C = FalseExpr; } return SE->getAddExpr(C, SE->getUMinExpr(CondExpr, SE->getMinusSCEV(X, C), /*Sequential=*/true)); } static std::optional createNodeForSelectViaUMinSeq(ScalarEvolution *SE, Value *Cond, Value *TrueVal, Value *FalseVal) { if (!isa(TrueVal) && !isa(FalseVal)) return std::nullopt; const auto *SECond = SE->getSCEV(Cond); const auto *SETrue = SE->getSCEV(TrueVal); const auto *SEFalse = SE->getSCEV(FalseVal); return createNodeForSelectViaUMinSeq(SE, SECond, SETrue, SEFalse); } const SCEV *ScalarEvolution::createNodeForSelectOrPHIViaUMinSeq( Value *V, Value *Cond, Value *TrueVal, Value *FalseVal) { assert(Cond->getType()->isIntegerTy(1) && "Select condition is not an i1?"); assert(TrueVal->getType() == FalseVal->getType() && V->getType() == TrueVal->getType() && "Types of select hands and of the result must match."); // For now, only deal with i1-typed `select`s. if (!V->getType()->isIntegerTy(1)) return getUnknown(V); if (std::optional S = createNodeForSelectViaUMinSeq(this, Cond, TrueVal, FalseVal)) return *S; return getUnknown(V); } const SCEV *ScalarEvolution::createNodeForSelectOrPHI(Value *V, Value *Cond, Value *TrueVal, Value *FalseVal) { // Handle "constant" branch or select. This can occur for instance when a // loop pass transforms an inner loop and moves on to process the outer loop. if (auto *CI = dyn_cast(Cond)) return getSCEV(CI->isOne() ? TrueVal : FalseVal); if (auto *I = dyn_cast(V)) { if (auto *ICI = dyn_cast(Cond)) { if (std::optional S = createNodeForSelectOrPHIInstWithICmpInstCond(I->getType(), ICI, TrueVal, FalseVal)) return *S; } } return createNodeForSelectOrPHIViaUMinSeq(V, Cond, TrueVal, FalseVal); } /// Expand GEP instructions into add and multiply operations. This allows them /// to be analyzed by regular SCEV code. 
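/// For example (illustrative; the exact index extension and no-wrap flags
/// depend on the GEP): `getelementptr i32, ptr %p, i64 %i` is modeled roughly
/// as (%p + (4 * %i)), so the usual add/mul-based reasoning applies to it.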
const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) { assert(GEP->getSourceElementType()->isSized() && "GEP source element type must be sized"); SmallVector IndexExprs; for (Value *Index : GEP->indices()) IndexExprs.push_back(getSCEV(Index)); return getGEPExpr(GEP, IndexExprs); } APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) { uint64_t BitWidth = getTypeSizeInBits(S->getType()); auto GetShiftedByZeros = [BitWidth](uint32_t TrailingZeros) { return TrailingZeros >= BitWidth ? APInt::getZero(BitWidth) : APInt::getOneBitSet(BitWidth, TrailingZeros); }; auto GetGCDMultiple = [this](const SCEVNAryExpr *N) { // The result is GCD of all operands results. APInt Res = getConstantMultiple(N->getOperand(0)); for (unsigned I = 1, E = N->getNumOperands(); I < E && Res != 1; ++I) Res = APIntOps::GreatestCommonDivisor( Res, getConstantMultiple(N->getOperand(I))); return Res; }; switch (S->getSCEVType()) { case scConstant: return cast(S)->getAPInt(); case scPtrToInt: return getConstantMultiple(cast(S)->getOperand()); case scUDivExpr: case scVScale: return APInt(BitWidth, 1); case scTruncate: { // Only multiples that are a power of 2 will hold after truncation. const SCEVTruncateExpr *T = cast(S); uint32_t TZ = getMinTrailingZeros(T->getOperand()); return GetShiftedByZeros(TZ); } case scZeroExtend: { const SCEVZeroExtendExpr *Z = cast(S); return getConstantMultiple(Z->getOperand()).zext(BitWidth); } case scSignExtend: { const SCEVSignExtendExpr *E = cast(S); return getConstantMultiple(E->getOperand()).sext(BitWidth); } case scMulExpr: { const SCEVMulExpr *M = cast(S); if (M->hasNoUnsignedWrap()) { // The result is the product of all operand results. APInt Res = getConstantMultiple(M->getOperand(0)); for (const SCEV *Operand : M->operands().drop_front()) Res = Res * getConstantMultiple(Operand); return Res; } // If there are no wrap guarentees, find the trailing zeros, which is the // sum of trailing zeros for all its operands. uint32_t TZ = 0; for (const SCEV *Operand : M->operands()) TZ += getMinTrailingZeros(Operand); return GetShiftedByZeros(TZ); } case scAddExpr: case scAddRecExpr: { const SCEVNAryExpr *N = cast(S); if (N->hasNoUnsignedWrap()) return GetGCDMultiple(N); // Find the trailing bits, which is the minimum of its operands. uint32_t TZ = getMinTrailingZeros(N->getOperand(0)); for (const SCEV *Operand : N->operands().drop_front()) TZ = std::min(TZ, getMinTrailingZeros(Operand)); return GetShiftedByZeros(TZ); } case scUMaxExpr: case scSMaxExpr: case scUMinExpr: case scSMinExpr: case scSequentialUMinExpr: return GetGCDMultiple(cast(S)); case scUnknown: { // ask ValueTracking for known bits const SCEVUnknown *U = cast(S); unsigned Known = computeKnownBits(U->getValue(), getDataLayout(), 0, &AC, nullptr, &DT) .countMinTrailingZeros(); return GetShiftedByZeros(Known); } case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); } llvm_unreachable("Unknown SCEV kind!"); } APInt ScalarEvolution::getConstantMultiple(const SCEV *S) { auto I = ConstantMultipleCache.find(S); if (I != ConstantMultipleCache.end()) return I->second; APInt Result = getConstantMultipleImpl(S); auto InsertPair = ConstantMultipleCache.insert({S, Result}); assert(InsertPair.second && "Should insert a new key"); return InsertPair.first->second; } APInt ScalarEvolution::getNonZeroConstantMultiple(const SCEV *S) { APInt Multiple = getConstantMultiple(S); return Multiple == 0 ? 
APInt(Multiple.getBitWidth(), 1) : Multiple; } uint32_t ScalarEvolution::getMinTrailingZeros(const SCEV *S) { return std::min(getConstantMultiple(S).countTrailingZeros(), (unsigned)getTypeSizeInBits(S->getType())); } /// Helper method to assign a range to V from metadata present in the IR. static std::optional GetRangeFromMetadata(Value *V) { if (Instruction *I = dyn_cast(V)) if (MDNode *MD = I->getMetadata(LLVMContext::MD_range)) return getConstantRangeFromMetadata(*MD); return std::nullopt; } void ScalarEvolution::setNoWrapFlags(SCEVAddRecExpr *AddRec, SCEV::NoWrapFlags Flags) { if (AddRec->getNoWrapFlags(Flags) != Flags) { AddRec->setNoWrapFlags(Flags); UnsignedRanges.erase(AddRec); SignedRanges.erase(AddRec); ConstantMultipleCache.erase(AddRec); } } ConstantRange ScalarEvolution:: getRangeForUnknownRecurrence(const SCEVUnknown *U) { const DataLayout &DL = getDataLayout(); unsigned BitWidth = getTypeSizeInBits(U->getType()); const ConstantRange FullSet(BitWidth, /*isFullSet=*/true); // Match a simple recurrence of the form: , and then // use information about the trip count to improve our available range. Note // that the trip count independent cases are already handled by known bits. // WARNING: The definition of recurrence used here is subtly different than // the one used by AddRec (and thus most of this file). Step is allowed to // be arbitrarily loop varying here, where AddRec allows only loop invariant // and other addrecs in the same loop (for non-affine addrecs). The code // below intentionally handles the case where step is not loop invariant. auto *P = dyn_cast(U->getValue()); if (!P) return FullSet; // Make sure that no Phi input comes from an unreachable block. Otherwise, // even the values that are not available in these blocks may come from them, // and this leads to false-positive recurrence test. for (auto *Pred : predecessors(P->getParent())) if (!DT.isReachableFromEntry(Pred)) return FullSet; BinaryOperator *BO; Value *Start, *Step; if (!matchSimpleRecurrence(P, BO, Start, Step)) return FullSet; // If we found a recurrence in reachable code, we must be in a loop. Note // that BO might be in some subloop of L, and that's completely okay. auto *L = LI.getLoopFor(P->getParent()); assert(L && L->getHeader() == P->getParent()); if (!L->contains(BO->getParent())) // NOTE: This bailout should be an assert instead. However, asserting // the condition here exposes a case where LoopFusion is querying SCEV // with malformed loop information during the midst of the transform. // There doesn't appear to be an obvious fix, so for the moment bailout // until the caller issue can be fixed. PR49566 tracks the bug. return FullSet; // TODO: Extend to other opcodes such as mul, and div switch (BO->getOpcode()) { default: return FullSet; case Instruction::AShr: case Instruction::LShr: case Instruction::Shl: break; }; if (BO->getOperand(0) != P) // TODO: Handle the power function forms some day. return FullSet; unsigned TC = getSmallConstantMaxTripCount(L); if (!TC || TC >= BitWidth) return FullSet; auto KnownStart = computeKnownBits(Start, DL, 0, &AC, nullptr, &DT); auto KnownStep = computeKnownBits(Step, DL, 0, &AC, nullptr, &DT); assert(KnownStart.getBitWidth() == BitWidth && KnownStep.getBitWidth() == BitWidth); // Compute total shift amount, being careful of overflow and bitwidths. 
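// Sketch with invented numbers: for a max trip count of 9 and a step whose
// known maximum value is 2, the recurrence shifts by at most 2 * (9 - 1) = 16
// bits overall; if this multiplication overflows, we give up below.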
auto MaxShiftAmt = KnownStep.getMaxValue(); APInt TCAP(BitWidth, TC-1); bool Overflow = false; auto TotalShift = MaxShiftAmt.umul_ov(TCAP, Overflow); if (Overflow) return FullSet; switch (BO->getOpcode()) { default: llvm_unreachable("filtered out above"); case Instruction::AShr: { // For each ashr, three cases: // shift = 0 => unchanged value // saturation => 0 or -1 // other => a value closer to zero (of the same sign) // Thus, the end value is closer to zero than the start. auto KnownEnd = KnownBits::ashr(KnownStart, KnownBits::makeConstant(TotalShift)); if (KnownStart.isNonNegative()) // Analogous to lshr (simply not yet canonicalized) return ConstantRange::getNonEmpty(KnownEnd.getMinValue(), KnownStart.getMaxValue() + 1); if (KnownStart.isNegative()) // End >=u Start && End <=s Start return ConstantRange::getNonEmpty(KnownStart.getMinValue(), KnownEnd.getMaxValue() + 1); break; } case Instruction::LShr: { // For each lshr, three cases: // shift = 0 => unchanged value // saturation => 0 // other => a smaller positive number // Thus, the low end of the unsigned range is the last value produced. auto KnownEnd = KnownBits::lshr(KnownStart, KnownBits::makeConstant(TotalShift)); return ConstantRange::getNonEmpty(KnownEnd.getMinValue(), KnownStart.getMaxValue() + 1); } case Instruction::Shl: { // Iff no bits are shifted out, value increases on every shift. auto KnownEnd = KnownBits::shl(KnownStart, KnownBits::makeConstant(TotalShift)); if (TotalShift.ult(KnownStart.countMinLeadingZeros())) return ConstantRange(KnownStart.getMinValue(), KnownEnd.getMaxValue() + 1); break; } }; return FullSet; } const ConstantRange & ScalarEvolution::getRangeRefIter(const SCEV *S, ScalarEvolution::RangeSignHint SignHint) { DenseMap &Cache = SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED ? UnsignedRanges : SignedRanges; SmallVector WorkList; SmallPtrSet Seen; // Add Expr to the worklist, if Expr is either an N-ary expression or a // SCEVUnknown PHI node. auto AddToWorklist = [&WorkList, &Seen, &Cache](const SCEV *Expr) { if (!Seen.insert(Expr).second) return; if (Cache.contains(Expr)) return; switch (Expr->getSCEVType()) { case scUnknown: if (!isa(cast(Expr)->getValue())) break; [[fallthrough]]; case scConstant: case scVScale: case scTruncate: case scZeroExtend: case scSignExtend: case scPtrToInt: case scAddExpr: case scMulExpr: case scUDivExpr: case scAddRecExpr: case scUMaxExpr: case scSMaxExpr: case scUMinExpr: case scSMinExpr: case scSequentialUMinExpr: WorkList.push_back(Expr); break; case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); } }; AddToWorklist(S); // Build worklist by queuing operands of N-ary expressions and phi nodes. for (unsigned I = 0; I != WorkList.size(); ++I) { const SCEV *P = WorkList[I]; auto *UnknownS = dyn_cast(P); // If it is not a `SCEVUnknown`, just recurse into operands. if (!UnknownS) { for (const SCEV *Op : P->operands()) AddToWorklist(Op); continue; } // `SCEVUnknown`'s require special treatment. if (const PHINode *P = dyn_cast(UnknownS->getValue())) { if (!PendingPhiRangesIter.insert(P).second) continue; for (auto &Op : reverse(P->operands())) AddToWorklist(getSCEV(Op)); } } if (!WorkList.empty()) { // Use getRangeRef to compute ranges for items in the worklist in reverse // order. This will force ranges for earlier operands to be computed before // their users in most cases. 
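// Illustrative walk (invented expression): for S = (%phi + (%b * %c)), the
// worklist is filled outside-in (S, then the multiply, plus the SCEVs of any
// phi operands), so the reverse traversal below computes ranges for the inner
// expressions first; the final call re-evaluates S at depth 0 with those
// sub-ranges already cached.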
for (const SCEV *P : reverse(make_range(WorkList.begin() + 1, WorkList.end()))) { getRangeRef(P, SignHint); if (auto *UnknownS = dyn_cast(P)) if (const PHINode *P = dyn_cast(UnknownS->getValue())) PendingPhiRangesIter.erase(P); } } return getRangeRef(S, SignHint, 0); } /// Determine the range for a particular SCEV. If SignHint is /// HINT_RANGE_UNSIGNED (resp. HINT_RANGE_SIGNED) then getRange prefers ranges /// with a "cleaner" unsigned (resp. signed) representation. const ConstantRange &ScalarEvolution::getRangeRef( const SCEV *S, ScalarEvolution::RangeSignHint SignHint, unsigned Depth) { DenseMap &Cache = SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED ? UnsignedRanges : SignedRanges; ConstantRange::PreferredRangeType RangeType = SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED ? ConstantRange::Unsigned : ConstantRange::Signed; // See if we've computed this range already. DenseMap::iterator I = Cache.find(S); if (I != Cache.end()) return I->second; if (const SCEVConstant *C = dyn_cast(S)) return setRange(C, SignHint, ConstantRange(C->getAPInt())); // Switch to iteratively computing the range for S, if it is part of a deeply // nested expression. if (Depth > RangeIterThreshold) return getRangeRefIter(S, SignHint); unsigned BitWidth = getTypeSizeInBits(S->getType()); ConstantRange ConservativeResult(BitWidth, /*isFullSet=*/true); using OBO = OverflowingBinaryOperator; // If the value has known zeros, the maximum value will have those known zeros // as well. if (SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED) { APInt Multiple = getNonZeroConstantMultiple(S); APInt Remainder = APInt::getMaxValue(BitWidth).urem(Multiple); if (!Remainder.isZero()) ConservativeResult = ConstantRange(APInt::getMinValue(BitWidth), APInt::getMaxValue(BitWidth) - Remainder + 1); } else { uint32_t TZ = getMinTrailingZeros(S); if (TZ != 0) { ConservativeResult = ConstantRange( APInt::getSignedMinValue(BitWidth), APInt::getSignedMaxValue(BitWidth).ashr(TZ).shl(TZ) + 1); } } switch (S->getSCEVType()) { case scConstant: llvm_unreachable("Already handled above."); case scVScale: return setRange(S, SignHint, getVScaleRange(&F, BitWidth)); case scTruncate: { const SCEVTruncateExpr *Trunc = cast(S); ConstantRange X = getRangeRef(Trunc->getOperand(), SignHint, Depth + 1); return setRange( Trunc, SignHint, ConservativeResult.intersectWith(X.truncate(BitWidth), RangeType)); } case scZeroExtend: { const SCEVZeroExtendExpr *ZExt = cast(S); ConstantRange X = getRangeRef(ZExt->getOperand(), SignHint, Depth + 1); return setRange( ZExt, SignHint, ConservativeResult.intersectWith(X.zeroExtend(BitWidth), RangeType)); } case scSignExtend: { const SCEVSignExtendExpr *SExt = cast(S); ConstantRange X = getRangeRef(SExt->getOperand(), SignHint, Depth + 1); return setRange( SExt, SignHint, ConservativeResult.intersectWith(X.signExtend(BitWidth), RangeType)); } case scPtrToInt: { const SCEVPtrToIntExpr *PtrToInt = cast(S); ConstantRange X = getRangeRef(PtrToInt->getOperand(), SignHint, Depth + 1); return setRange(PtrToInt, SignHint, X); } case scAddExpr: { const SCEVAddExpr *Add = cast(S); ConstantRange X = getRangeRef(Add->getOperand(0), SignHint, Depth + 1); unsigned WrapType = OBO::AnyWrap; if (Add->hasNoSignedWrap()) WrapType |= OBO::NoSignedWrap; if (Add->hasNoUnsignedWrap()) WrapType |= OBO::NoUnsignedWrap; for (unsigned i = 1, e = Add->getNumOperands(); i != e; ++i) X = X.addWithNoWrap(getRangeRef(Add->getOperand(i), SignHint, Depth + 1), WrapType, RangeType); return setRange(Add, SignHint, ConservativeResult.intersectWith(X, 
RangeType)); } case scMulExpr: { const SCEVMulExpr *Mul = cast(S); ConstantRange X = getRangeRef(Mul->getOperand(0), SignHint, Depth + 1); for (unsigned i = 1, e = Mul->getNumOperands(); i != e; ++i) X = X.multiply(getRangeRef(Mul->getOperand(i), SignHint, Depth + 1)); return setRange(Mul, SignHint, ConservativeResult.intersectWith(X, RangeType)); } case scUDivExpr: { const SCEVUDivExpr *UDiv = cast(S); ConstantRange X = getRangeRef(UDiv->getLHS(), SignHint, Depth + 1); ConstantRange Y = getRangeRef(UDiv->getRHS(), SignHint, Depth + 1); return setRange(UDiv, SignHint, ConservativeResult.intersectWith(X.udiv(Y), RangeType)); } case scAddRecExpr: { const SCEVAddRecExpr *AddRec = cast(S); // If there's no unsigned wrap, the value will never be less than its // initial value. if (AddRec->hasNoUnsignedWrap()) { APInt UnsignedMinValue = getUnsignedRangeMin(AddRec->getStart()); if (!UnsignedMinValue.isZero()) ConservativeResult = ConservativeResult.intersectWith( ConstantRange(UnsignedMinValue, APInt(BitWidth, 0)), RangeType); } // If there's no signed wrap, and all the operands except initial value have // the same sign or zero, the value won't ever be: // 1: smaller than initial value if operands are non negative, // 2: bigger than initial value if operands are non positive. // For both cases, value can not cross signed min/max boundary. if (AddRec->hasNoSignedWrap()) { bool AllNonNeg = true; bool AllNonPos = true; for (unsigned i = 1, e = AddRec->getNumOperands(); i != e; ++i) { if (!isKnownNonNegative(AddRec->getOperand(i))) AllNonNeg = false; if (!isKnownNonPositive(AddRec->getOperand(i))) AllNonPos = false; } if (AllNonNeg) ConservativeResult = ConservativeResult.intersectWith( ConstantRange::getNonEmpty(getSignedRangeMin(AddRec->getStart()), APInt::getSignedMinValue(BitWidth)), RangeType); else if (AllNonPos) ConservativeResult = ConservativeResult.intersectWith( ConstantRange::getNonEmpty(APInt::getSignedMinValue(BitWidth), getSignedRangeMax(AddRec->getStart()) + 1), RangeType); } // TODO: non-affine addrec if (AddRec->isAffine()) { const SCEV *MaxBEScev = getConstantMaxBackedgeTakenCount(AddRec->getLoop()); if (!isa(MaxBEScev)) { APInt MaxBECount = cast(MaxBEScev)->getAPInt(); // Adjust MaxBECount to the same bitwidth as AddRec. We can truncate if // MaxBECount's active bits are all <= AddRec's bit width. if (MaxBECount.getBitWidth() > BitWidth && MaxBECount.getActiveBits() <= BitWidth) MaxBECount = MaxBECount.trunc(BitWidth); else if (MaxBECount.getBitWidth() < BitWidth) MaxBECount = MaxBECount.zext(BitWidth); if (MaxBECount.getBitWidth() == BitWidth) { auto RangeFromAffine = getRangeForAffineAR( AddRec->getStart(), AddRec->getStepRecurrence(*this), MaxBECount); ConservativeResult = ConservativeResult.intersectWith(RangeFromAffine, RangeType); auto RangeFromFactoring = getRangeViaFactoring( AddRec->getStart(), AddRec->getStepRecurrence(*this), MaxBECount); ConservativeResult = ConservativeResult.intersectWith(RangeFromFactoring, RangeType); } } // Now try symbolic BE count and more powerful methods. 
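// (The sharpening below, which exploits a symbolic backedge-taken count and
// the addrec's no-self-wrap flag, is kept behind UseExpensiveRangeSharpening
// since it can be noticeably more costly than the constant-max path above.)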
if (UseExpensiveRangeSharpening) { const SCEV *SymbolicMaxBECount = getSymbolicMaxBackedgeTakenCount(AddRec->getLoop()); if (!isa(SymbolicMaxBECount) && getTypeSizeInBits(MaxBEScev->getType()) <= BitWidth && AddRec->hasNoSelfWrap()) { auto RangeFromAffineNew = getRangeForAffineNoSelfWrappingAR( AddRec, SymbolicMaxBECount, BitWidth, SignHint); ConservativeResult = ConservativeResult.intersectWith(RangeFromAffineNew, RangeType); } } } return setRange(AddRec, SignHint, std::move(ConservativeResult)); } case scUMaxExpr: case scSMaxExpr: case scUMinExpr: case scSMinExpr: case scSequentialUMinExpr: { Intrinsic::ID ID; switch (S->getSCEVType()) { case scUMaxExpr: ID = Intrinsic::umax; break; case scSMaxExpr: ID = Intrinsic::smax; break; case scUMinExpr: case scSequentialUMinExpr: ID = Intrinsic::umin; break; case scSMinExpr: ID = Intrinsic::smin; break; default: llvm_unreachable("Unknown SCEVMinMaxExpr/SCEVSequentialMinMaxExpr."); } const auto *NAry = cast(S); ConstantRange X = getRangeRef(NAry->getOperand(0), SignHint, Depth + 1); for (unsigned i = 1, e = NAry->getNumOperands(); i != e; ++i) X = X.intrinsic( ID, {X, getRangeRef(NAry->getOperand(i), SignHint, Depth + 1)}); return setRange(S, SignHint, ConservativeResult.intersectWith(X, RangeType)); } case scUnknown: { const SCEVUnknown *U = cast(S); Value *V = U->getValue(); // Check if the IR explicitly contains !range metadata. std::optional MDRange = GetRangeFromMetadata(V); if (MDRange) ConservativeResult = ConservativeResult.intersectWith(*MDRange, RangeType); // Use facts about recurrences in the underlying IR. Note that add // recurrences are AddRecExprs and thus don't hit this path. This // primarily handles shift recurrences. auto CR = getRangeForUnknownRecurrence(U); ConservativeResult = ConservativeResult.intersectWith(CR); // See if ValueTracking can give us a useful range. const DataLayout &DL = getDataLayout(); KnownBits Known = computeKnownBits(V, DL, 0, &AC, nullptr, &DT); if (Known.getBitWidth() != BitWidth) Known = Known.zextOrTrunc(BitWidth); // ValueTracking may be able to compute a tighter result for the number of // sign bits than for the value of those sign bits. unsigned NS = ComputeNumSignBits(V, DL, 0, &AC, nullptr, &DT); if (U->getType()->isPointerTy()) { // If the pointer size is larger than the index size type, this can cause // NS to be larger than BitWidth. So compensate for this. unsigned ptrSize = DL.getPointerTypeSizeInBits(U->getType()); int ptrIdxDiff = ptrSize - BitWidth; if (ptrIdxDiff > 0 && ptrSize > BitWidth && NS > (unsigned)ptrIdxDiff) NS -= ptrIdxDiff; } if (NS > 1) { // If we know any of the sign bits, we know all of the sign bits. if (!Known.Zero.getHiBits(NS).isZero()) Known.Zero.setHighBits(NS); if (!Known.One.getHiBits(NS).isZero()) Known.One.setHighBits(NS); } if (Known.getMinValue() != Known.getMaxValue() + 1) ConservativeResult = ConservativeResult.intersectWith( ConstantRange(Known.getMinValue(), Known.getMaxValue() + 1), RangeType); if (NS > 1) ConservativeResult = ConservativeResult.intersectWith( ConstantRange(APInt::getSignedMinValue(BitWidth).ashr(NS - 1), APInt::getSignedMaxValue(BitWidth).ashr(NS - 1) + 1), RangeType); if (U->getType()->isPointerTy() && SignHint == HINT_RANGE_UNSIGNED) { // Strengthen the range if the underlying IR value is a // global/alloca/heap allocation using the size of the object. 
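// Worked example (assuming 32-bit pointers): for a 16-byte object with 8-byte alignment, MaxVal = 0xFFFFFFFF - 16 = 0xFFFFFFEF, which rounds down to the alignment as 0xFFFFFFE8; if V is also known non-null, MinVal = 8 and the computed unsigned range becomes [8, 0xFFFFFFE9).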
ObjectSizeOpts Opts; Opts.RoundToAlign = false; Opts.NullIsUnknownSize = true; uint64_t ObjSize; if ((isa(V) || isa(V) || isAllocationFn(V, &TLI)) && getObjectSize(V, ObjSize, DL, &TLI, Opts) && ObjSize > 1) { // The highest address the object can start is ObjSize bytes before the // end (unsigned max value). If this value is not a multiple of the // alignment, the last possible start value is the next lowest multiple // of the alignment. Note: The computations below cannot overflow, // because if they would there's no possible start address for the // object. APInt MaxVal = APInt::getMaxValue(BitWidth) - APInt(BitWidth, ObjSize); uint64_t Align = U->getValue()->getPointerAlignment(DL).value(); uint64_t Rem = MaxVal.urem(Align); MaxVal -= APInt(BitWidth, Rem); APInt MinVal = APInt::getZero(BitWidth); if (llvm::isKnownNonZero(V, DL)) MinVal = Align; ConservativeResult = ConservativeResult.intersectWith( - {MinVal, MaxVal + 1}, RangeType); + ConstantRange::getNonEmpty(MinVal, MaxVal + 1), RangeType); } } // A range of Phi is a subset of union of all ranges of its input. if (PHINode *Phi = dyn_cast(V)) { // Make sure that we do not run over cycled Phis. if (PendingPhiRanges.insert(Phi).second) { ConstantRange RangeFromOps(BitWidth, /*isFullSet=*/false); for (const auto &Op : Phi->operands()) { auto OpRange = getRangeRef(getSCEV(Op), SignHint, Depth + 1); RangeFromOps = RangeFromOps.unionWith(OpRange); // No point to continue if we already have a full set. if (RangeFromOps.isFullSet()) break; } ConservativeResult = ConservativeResult.intersectWith(RangeFromOps, RangeType); bool Erased = PendingPhiRanges.erase(Phi); assert(Erased && "Failed to erase Phi properly?"); (void)Erased; } } // vscale can't be equal to zero if (const auto *II = dyn_cast(V)) if (II->getIntrinsicID() == Intrinsic::vscale) { ConstantRange Disallowed = APInt::getZero(BitWidth); ConservativeResult = ConservativeResult.difference(Disallowed); } return setRange(U, SignHint, std::move(ConservativeResult)); } case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); } return setRange(S, SignHint, std::move(ConservativeResult)); } // Given a StartRange, Step and MaxBECount for an expression compute a range of // values that the expression can take. Initially, the expression has a value // from StartRange and then is changed by Step up to MaxBECount times. Signed // argument defines if we treat Step as signed or unsigned. static ConstantRange getRangeForAffineARHelper(APInt Step, const ConstantRange &StartRange, const APInt &MaxBECount, bool Signed) { unsigned BitWidth = Step.getBitWidth(); assert(BitWidth == StartRange.getBitWidth() && BitWidth == MaxBECount.getBitWidth() && "mismatched bit widths"); // If either Step or MaxBECount is 0, then the expression won't change, and we // just need to return the initial range. if (Step == 0 || MaxBECount == 0) return StartRange; // If we don't know anything about the initial value (i.e. StartRange is // FullRange), then we don't know anything about the final range either. // Return FullRange. if (StartRange.isFullSet()) return ConstantRange::getFull(BitWidth); // If Step is signed and negative, then we use its absolute value, but we also // note that we're moving in the opposite direction. bool Descending = Signed && Step.isNegative(); if (Signed) // This is correct even for INT_SMIN. Let's look at i8 to illustrate this: // abs(INT_SMIN) = abs(-128) = abs(0x80) = -0x80 = 0x80 = 128. 
// These equations hold true due to the well-defined wrap-around behavior of // APInt. Step = Step.abs(); // Check if Offset is more than full span of BitWidth. If it is, the // expression is guaranteed to overflow. if (APInt::getMaxValue(StartRange.getBitWidth()).udiv(Step).ult(MaxBECount)) return ConstantRange::getFull(BitWidth); // Offset is by how much the expression can change. Checks above guarantee no // overflow here. APInt Offset = Step * MaxBECount; // Minimum value of the final range will match the minimal value of StartRange // if the expression is increasing and will be decreased by Offset otherwise. // Maximum value of the final range will match the maximal value of StartRange // if the expression is decreasing and will be increased by Offset otherwise. APInt StartLower = StartRange.getLower(); APInt StartUpper = StartRange.getUpper() - 1; APInt MovedBoundary = Descending ? (StartLower - std::move(Offset)) : (StartUpper + std::move(Offset)); // It's possible that the new minimum/maximum value will fall into the initial // range (due to wrap around). This means that the expression can take any // value in this bitwidth, and we have to return full range. if (StartRange.contains(MovedBoundary)) return ConstantRange::getFull(BitWidth); APInt NewLower = Descending ? std::move(MovedBoundary) : std::move(StartLower); APInt NewUpper = Descending ? std::move(StartUpper) : std::move(MovedBoundary); NewUpper += 1; // No overflow detected, return [StartLower, StartUpper + Offset + 1) range. return ConstantRange::getNonEmpty(std::move(NewLower), std::move(NewUpper)); } ConstantRange ScalarEvolution::getRangeForAffineAR(const SCEV *Start, const SCEV *Step, const APInt &MaxBECount) { assert(getTypeSizeInBits(Start->getType()) == getTypeSizeInBits(Step->getType()) && getTypeSizeInBits(Start->getType()) == MaxBECount.getBitWidth() && "mismatched bit widths"); // First, consider step signed. ConstantRange StartSRange = getSignedRange(Start); ConstantRange StepSRange = getSignedRange(Step); // If Step can be both positive and negative, we need to find ranges for the // maximum absolute step values in both directions and union them. ConstantRange SR = getRangeForAffineARHelper( StepSRange.getSignedMin(), StartSRange, MaxBECount, /* Signed = */ true); SR = SR.unionWith(getRangeForAffineARHelper(StepSRange.getSignedMax(), StartSRange, MaxBECount, /* Signed = */ true)); // Next, consider step unsigned. ConstantRange UR = getRangeForAffineARHelper( getUnsignedRangeMax(Step), getUnsignedRange(Start), MaxBECount, /* Signed = */ false); // Finally, intersect signed and unsigned ranges. return SR.intersectWith(UR, ConstantRange::Smallest); } ConstantRange ScalarEvolution::getRangeForAffineNoSelfWrappingAR( const SCEVAddRecExpr *AddRec, const SCEV *MaxBECount, unsigned BitWidth, ScalarEvolution::RangeSignHint SignHint) { assert(AddRec->isAffine() && "Non-affine AddRecs are not supported!\n"); assert(AddRec->hasNoSelfWrap() && "This only works for non-self-wrapping AddRecs!"); const bool IsSigned = SignHint == HINT_RANGE_SIGNED; const SCEV *Step = AddRec->getStepRecurrence(*this); // Only deal with constant step to save compile time. if (!isa(Step)) return ConstantRange::getFull(BitWidth); // Let's make sure that we can prove that we do not self-wrap during // MaxBECount iterations. We need this because MaxBECount is a maximum // iteration count estimate, and we might infer nw from some exit for which we // do not know max exit count (or any other side reasoning). // TODO: Turn into assert at some point. 
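// Concretely, the check below proves MaxBECount <= (2^BitWidth - 1) /u |Step|, i.e. that MaxBECount increments of the step cannot cover the whole 2^BitWidth value space; |Step| is formed as umin(Step, -Step) so it also handles negative constant steps.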
if (getTypeSizeInBits(MaxBECount->getType()) > getTypeSizeInBits(AddRec->getType())) return ConstantRange::getFull(BitWidth); MaxBECount = getNoopOrZeroExtend(MaxBECount, AddRec->getType()); const SCEV *RangeWidth = getMinusOne(AddRec->getType()); const SCEV *StepAbs = getUMinExpr(Step, getNegativeSCEV(Step)); const SCEV *MaxItersWithoutWrap = getUDivExpr(RangeWidth, StepAbs); if (!isKnownPredicateViaConstantRanges(ICmpInst::ICMP_ULE, MaxBECount, MaxItersWithoutWrap)) return ConstantRange::getFull(BitWidth); ICmpInst::Predicate LEPred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; ICmpInst::Predicate GEPred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; const SCEV *End = AddRec->evaluateAtIteration(MaxBECount, *this); // We know that there is no self-wrap. Let's take Start and End values and // look at all intermediate values V1, V2, ..., Vn that IndVar takes during // the iteration. They either lie inside the range [Min(Start, End), // Max(Start, End)] or outside it: // // Case 1: RangeMin ... Start V1 ... VN End ... RangeMax; // Case 2: RangeMin Vk ... V1 Start ... End Vn ... Vk + 1 RangeMax; // // No self wrap flag guarantees that the intermediate values cannot be BOTH // outside and inside the range [Min(Start, End), Max(Start, End)]. Using that // knowledge, let's try to prove that we are dealing with Case 1. It is so if // Start <= End and step is positive, or Start >= End and step is negative. const SCEV *Start = applyLoopGuards(AddRec->getStart(), AddRec->getLoop()); ConstantRange StartRange = getRangeRef(Start, SignHint); ConstantRange EndRange = getRangeRef(End, SignHint); ConstantRange RangeBetween = StartRange.unionWith(EndRange); // If they already cover full iteration space, we will know nothing useful // even if we prove what we want to prove. if (RangeBetween.isFullSet()) return RangeBetween; // Only deal with ranges that do not wrap (i.e. RangeMin < RangeMax). bool IsWrappedSet = IsSigned ? RangeBetween.isSignWrappedSet() : RangeBetween.isWrappedSet(); if (IsWrappedSet) return ConstantRange::getFull(BitWidth); if (isKnownPositive(Step) && isKnownPredicateViaConstantRanges(LEPred, Start, End)) return RangeBetween; if (isKnownNegative(Step) && isKnownPredicateViaConstantRanges(GEPred, Start, End)) return RangeBetween; return ConstantRange::getFull(BitWidth); } ConstantRange ScalarEvolution::getRangeViaFactoring(const SCEV *Start, const SCEV *Step, const APInt &MaxBECount) { // RangeOf({C?A:B,+,C?P:Q}) == RangeOf(C?{A,+,P}:{B,+,Q}) // == RangeOf({A,+,P}) union RangeOf({B,+,Q}) unsigned BitWidth = MaxBECount.getBitWidth(); assert(getTypeSizeInBits(Start->getType()) == BitWidth && getTypeSizeInBits(Step->getType()) == BitWidth && "mismatched bit widths"); struct SelectPattern { Value *Condition = nullptr; APInt TrueValue; APInt FalseValue; explicit SelectPattern(ScalarEvolution &SE, unsigned BitWidth, const SCEV *S) { std::optional CastOp; APInt Offset(BitWidth, 0); assert(SE.getTypeSizeInBits(S->getType()) == BitWidth && "Should be!"); // Peel off a constant offset: if (auto *SA = dyn_cast(S)) { // In the future we could consider being smarter here and handle // {Start+Step,+,Step} too. 
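// Only the (Constant + X) shape with exactly two operands is peeled here; the constant is re-added to both arms at the end. For example, matching S = 5 + zext(select %c, i8 3, i8 7) records Offset = 5, and after the zext and offset are re-applied TrueValue = 8 and FalseValue = 12.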
if (SA->getNumOperands() != 2 || !isa(SA->getOperand(0))) return; Offset = cast(SA->getOperand(0))->getAPInt(); S = SA->getOperand(1); } // Peel off a cast operation if (auto *SCast = dyn_cast(S)) { CastOp = SCast->getSCEVType(); S = SCast->getOperand(); } using namespace llvm::PatternMatch; auto *SU = dyn_cast(S); const APInt *TrueVal, *FalseVal; if (!SU || !match(SU->getValue(), m_Select(m_Value(Condition), m_APInt(TrueVal), m_APInt(FalseVal)))) { Condition = nullptr; return; } TrueValue = *TrueVal; FalseValue = *FalseVal; // Re-apply the cast we peeled off earlier if (CastOp) switch (*CastOp) { default: llvm_unreachable("Unknown SCEV cast type!"); case scTruncate: TrueValue = TrueValue.trunc(BitWidth); FalseValue = FalseValue.trunc(BitWidth); break; case scZeroExtend: TrueValue = TrueValue.zext(BitWidth); FalseValue = FalseValue.zext(BitWidth); break; case scSignExtend: TrueValue = TrueValue.sext(BitWidth); FalseValue = FalseValue.sext(BitWidth); break; } // Re-apply the constant offset we peeled off earlier TrueValue += Offset; FalseValue += Offset; } bool isRecognized() { return Condition != nullptr; } }; SelectPattern StartPattern(*this, BitWidth, Start); if (!StartPattern.isRecognized()) return ConstantRange::getFull(BitWidth); SelectPattern StepPattern(*this, BitWidth, Step); if (!StepPattern.isRecognized()) return ConstantRange::getFull(BitWidth); if (StartPattern.Condition != StepPattern.Condition) { // We don't handle this case today; but we could, by considering four // possibilities below instead of two. I'm not sure if there are cases where // that will help over what getRange already does, though. return ConstantRange::getFull(BitWidth); } // NB! Calling ScalarEvolution::getConstant is fine, but we should not try to // construct arbitrary general SCEV expressions here. This function is called // from deep in the call stack, and calling getSCEV (on a sext instruction, // say) can end up caching a suboptimal value. // FIXME: without the explicit `this` receiver below, MSVC errors out with // C2352 and C2512 (otherwise it isn't needed). const SCEV *TrueStart = this->getConstant(StartPattern.TrueValue); const SCEV *TrueStep = this->getConstant(StepPattern.TrueValue); const SCEV *FalseStart = this->getConstant(StartPattern.FalseValue); const SCEV *FalseStep = this->getConstant(StepPattern.FalseValue); ConstantRange TrueRange = this->getRangeForAffineAR(TrueStart, TrueStep, MaxBECount); ConstantRange FalseRange = this->getRangeForAffineAR(FalseStart, FalseStep, MaxBECount); return TrueRange.unionWith(FalseRange); } SCEV::NoWrapFlags ScalarEvolution::getNoWrapFlagsFromUB(const Value *V) { if (isa(V)) return SCEV::FlagAnyWrap; const BinaryOperator *BinOp = cast(V); // Return early if there are no flags to propagate to the SCEV. SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap; if (BinOp->hasNoUnsignedWrap()) Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW); if (BinOp->hasNoSignedWrap()) Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW); if (Flags == SCEV::FlagAnyWrap) return SCEV::FlagAnyWrap; return isSCEVExprNeverPoison(BinOp) ? 
Flags : SCEV::FlagAnyWrap; } const Instruction * ScalarEvolution::getNonTrivialDefiningScopeBound(const SCEV *S) { if (auto *AddRec = dyn_cast(S)) return &*AddRec->getLoop()->getHeader()->begin(); if (auto *U = dyn_cast(S)) if (auto *I = dyn_cast(U->getValue())) return I; return nullptr; } const Instruction * ScalarEvolution::getDefiningScopeBound(ArrayRef Ops, bool &Precise) { Precise = true; // Do a bounded search of the def relation of the requested SCEVs. SmallSet Visited; SmallVector Worklist; auto pushOp = [&](const SCEV *S) { if (!Visited.insert(S).second) return; // Threshold of 30 here is arbitrary. if (Visited.size() > 30) { Precise = false; return; } Worklist.push_back(S); }; for (const auto *S : Ops) pushOp(S); const Instruction *Bound = nullptr; while (!Worklist.empty()) { auto *S = Worklist.pop_back_val(); if (auto *DefI = getNonTrivialDefiningScopeBound(S)) { if (!Bound || DT.dominates(Bound, DefI)) Bound = DefI; } else { for (const auto *Op : S->operands()) pushOp(Op); } } return Bound ? Bound : &*F.getEntryBlock().begin(); } const Instruction * ScalarEvolution::getDefiningScopeBound(ArrayRef Ops) { bool Discard; return getDefiningScopeBound(Ops, Discard); } bool ScalarEvolution::isGuaranteedToTransferExecutionTo(const Instruction *A, const Instruction *B) { if (A->getParent() == B->getParent() && isGuaranteedToTransferExecutionToSuccessor(A->getIterator(), B->getIterator())) return true; auto *BLoop = LI.getLoopFor(B->getParent()); if (BLoop && BLoop->getHeader() == B->getParent() && BLoop->getLoopPreheader() == A->getParent() && isGuaranteedToTransferExecutionToSuccessor(A->getIterator(), A->getParent()->end()) && isGuaranteedToTransferExecutionToSuccessor(B->getParent()->begin(), B->getIterator())) return true; return false; } bool ScalarEvolution::isSCEVExprNeverPoison(const Instruction *I) { // Only proceed if we can prove that I does not yield poison. if (!programUndefinedIfPoison(I)) return false; // At this point we know that if I is executed, then it does not wrap // according to at least one of NSW or NUW. If I is not executed, then we do // not know if the calculation that I represents would wrap. Multiple // instructions can map to the same SCEV. If we apply NSW or NUW from I to // the SCEV, we must guarantee no wrapping for that SCEV also when it is // derived from other instructions that map to the same SCEV. We cannot make // that guarantee for cases where I is not executed. So we need to find a // upper bound on the defining scope for the SCEV, and prove that I is // executed every time we enter that scope. When the bounding scope is a // loop (the common case), this is equivalent to proving I executes on every // iteration of that loop. SmallVector SCEVOps; for (const Use &Op : I->operands()) { // I could be an extractvalue from a call to an overflow intrinsic. // TODO: We can do better here in some cases. if (isSCEVable(Op->getType())) SCEVOps.push_back(getSCEV(Op)); } auto *DefI = getDefiningScopeBound(SCEVOps); return isGuaranteedToTransferExecutionTo(DefI, I); } bool ScalarEvolution::isAddRecNeverPoison(const Instruction *I, const Loop *L) { // If we know that \c I can never be poison period, then that's enough. if (isSCEVExprNeverPoison(I)) return true; // If the loop only has one exit, then we know that, if the loop is entered, // any instruction dominating that exit will be executed. If any such // instruction would result in UB, the addrec cannot be poison. 
// // This is basically the same reasoning as in isSCEVExprNeverPoison(), but // also handles uses outside the loop header (they just need to dominate the // single exit). auto *ExitingBB = L->getExitingBlock(); if (!ExitingBB || !loopHasNoAbnormalExits(L)) return false; SmallPtrSet KnownPoison; SmallVector Worklist; // We start by assuming \c I, the post-inc add recurrence, is poison. Only // things that are known to be poison under that assumption go on the // Worklist. KnownPoison.insert(I); Worklist.push_back(I); while (!Worklist.empty()) { const Instruction *Poison = Worklist.pop_back_val(); for (const Use &U : Poison->uses()) { const Instruction *PoisonUser = cast(U.getUser()); if (mustTriggerUB(PoisonUser, KnownPoison) && DT.dominates(PoisonUser->getParent(), ExitingBB)) return true; if (propagatesPoison(U) && L->contains(PoisonUser)) if (KnownPoison.insert(PoisonUser).second) Worklist.push_back(PoisonUser); } } return false; } ScalarEvolution::LoopProperties ScalarEvolution::getLoopProperties(const Loop *L) { using LoopProperties = ScalarEvolution::LoopProperties; auto Itr = LoopPropertiesCache.find(L); if (Itr == LoopPropertiesCache.end()) { auto HasSideEffects = [](Instruction *I) { if (auto *SI = dyn_cast(I)) return !SI->isSimple(); return I->mayThrow() || I->mayWriteToMemory(); }; LoopProperties LP = {/* HasNoAbnormalExits */ true, /*HasNoSideEffects*/ true}; for (auto *BB : L->getBlocks()) for (auto &I : *BB) { if (!isGuaranteedToTransferExecutionToSuccessor(&I)) LP.HasNoAbnormalExits = false; if (HasSideEffects(&I)) LP.HasNoSideEffects = false; if (!LP.HasNoAbnormalExits && !LP.HasNoSideEffects) break; // We're already as pessimistic as we can get. } auto InsertPair = LoopPropertiesCache.insert({L, LP}); assert(InsertPair.second && "We just checked!"); Itr = InsertPair.first; } return Itr->second; } bool ScalarEvolution::loopIsFiniteByAssumption(const Loop *L) { // A mustprogress loop without side effects must be finite. // TODO: The check used here is very conservative. It's only *specific* // side effects which are well defined in infinite loops. return isFinite(L) || (isMustProgress(L) && loopHasNoSideEffects(L)); } const SCEV *ScalarEvolution::createSCEVIter(Value *V) { // Worklist item with a Value and a bool indicating whether all operands have // been visited already. using PointerTy = PointerIntPair; SmallVector Stack; Stack.emplace_back(V, true); Stack.emplace_back(V, false); while (!Stack.empty()) { auto E = Stack.pop_back_val(); Value *CurV = E.getPointer(); if (getExistingSCEV(CurV)) continue; SmallVector Ops; const SCEV *CreatedSCEV = nullptr; // If all operands have been visited already, create the SCEV. if (E.getInt()) { CreatedSCEV = createSCEV(CurV); } else { // Otherwise get the operands we need to create SCEV's for before creating // the SCEV for CurV. If the SCEV for CurV can be constructed trivially, // just use it. CreatedSCEV = getOperandsToCreate(CurV, Ops); } if (CreatedSCEV) { insertValueToMap(CurV, CreatedSCEV); } else { // Queue CurV for SCEV creation, followed by its's operands which need to // be constructed first. Stack.emplace_back(CurV, true); for (Value *Op : Ops) Stack.emplace_back(Op, false); } } return getExistingSCEV(V); } const SCEV * ScalarEvolution::getOperandsToCreate(Value *V, SmallVectorImpl &Ops) { if (!isSCEVable(V->getType())) return getUnknown(V); if (Instruction *I = dyn_cast(V)) { // Don't attempt to analyze instructions in blocks that aren't // reachable. 
Such instructions don't matter, and they aren't required // to obey basic rules for definitions dominating uses which this // analysis depends on. if (!DT.isReachableFromEntry(I->getParent())) return getUnknown(PoisonValue::get(V->getType())); } else if (ConstantInt *CI = dyn_cast(V)) return getConstant(CI); else if (isa(V)) return getUnknown(V); else if (!isa(V)) return getUnknown(V); Operator *U = cast(V); if (auto BO = MatchBinaryOp(U, getDataLayout(), AC, DT, dyn_cast(V))) { bool IsConstArg = isa(BO->RHS); switch (BO->Opcode) { case Instruction::Add: case Instruction::Mul: { // For additions and multiplications, traverse add/mul chains for which we // can potentially create a single SCEV, to reduce the number of // get{Add,Mul}Expr calls. do { if (BO->Op) { if (BO->Op != V && getExistingSCEV(BO->Op)) { Ops.push_back(BO->Op); break; } } Ops.push_back(BO->RHS); auto NewBO = MatchBinaryOp(BO->LHS, getDataLayout(), AC, DT, dyn_cast(V)); if (!NewBO || (BO->Opcode == Instruction::Add && (NewBO->Opcode != Instruction::Add && NewBO->Opcode != Instruction::Sub)) || (BO->Opcode == Instruction::Mul && NewBO->Opcode != Instruction::Mul)) { Ops.push_back(BO->LHS); break; } // CreateSCEV calls getNoWrapFlagsFromUB, which under certain conditions // requires a SCEV for the LHS. if (BO->Op && (BO->IsNSW || BO->IsNUW)) { auto *I = dyn_cast(BO->Op); if (I && programUndefinedIfPoison(I)) { Ops.push_back(BO->LHS); break; } } BO = NewBO; } while (true); return nullptr; } case Instruction::Sub: case Instruction::UDiv: case Instruction::URem: break; case Instruction::AShr: case Instruction::Shl: case Instruction::Xor: if (!IsConstArg) return nullptr; break; case Instruction::And: case Instruction::Or: if (!IsConstArg && !BO->LHS->getType()->isIntegerTy(1)) return nullptr; break; case Instruction::LShr: return getUnknown(V); default: llvm_unreachable("Unhandled binop"); break; } Ops.push_back(BO->LHS); Ops.push_back(BO->RHS); return nullptr; } switch (U->getOpcode()) { case Instruction::Trunc: case Instruction::ZExt: case Instruction::SExt: case Instruction::PtrToInt: Ops.push_back(U->getOperand(0)); return nullptr; case Instruction::BitCast: if (isSCEVable(U->getType()) && isSCEVable(U->getOperand(0)->getType())) { Ops.push_back(U->getOperand(0)); return nullptr; } return getUnknown(V); case Instruction::SDiv: case Instruction::SRem: Ops.push_back(U->getOperand(0)); Ops.push_back(U->getOperand(1)); return nullptr; case Instruction::GetElementPtr: assert(cast(U)->getSourceElementType()->isSized() && "GEP source element type must be sized"); for (Value *Index : U->operands()) Ops.push_back(Index); return nullptr; case Instruction::IntToPtr: return getUnknown(V); case Instruction::PHI: // Keep constructing SCEVs' for phis recursively for now. return nullptr; case Instruction::Select: { // Check if U is a select that can be simplified to a SCEVUnknown. 
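// The lambda below treats the select as an opaque SCEVUnknown in the cases where expanding it is unlikely to help: an icmp eq/ne condition compared against something other than constant zero, or another icmp predicate whose operands are wider than the select itself. In all other cases the operands are queued so the select can be expanded.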
auto CanSimplifyToUnknown = [this, U]() { if (U->getType()->isIntegerTy(1) || isa(U->getOperand(0))) return false; auto *ICI = dyn_cast(U->getOperand(0)); if (!ICI) return false; Value *LHS = ICI->getOperand(0); Value *RHS = ICI->getOperand(1); if (ICI->getPredicate() == CmpInst::ICMP_EQ || ICI->getPredicate() == CmpInst::ICMP_NE) { if (!(isa(RHS) && cast(RHS)->isZero())) return true; } else if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(U->getType())) return true; return false; }; if (CanSimplifyToUnknown()) return getUnknown(U); for (Value *Inc : U->operands()) Ops.push_back(Inc); return nullptr; break; } case Instruction::Call: case Instruction::Invoke: if (Value *RV = cast(U)->getReturnedArgOperand()) { Ops.push_back(RV); return nullptr; } if (auto *II = dyn_cast(U)) { switch (II->getIntrinsicID()) { case Intrinsic::abs: Ops.push_back(II->getArgOperand(0)); return nullptr; case Intrinsic::umax: case Intrinsic::umin: case Intrinsic::smax: case Intrinsic::smin: case Intrinsic::usub_sat: case Intrinsic::uadd_sat: Ops.push_back(II->getArgOperand(0)); Ops.push_back(II->getArgOperand(1)); return nullptr; case Intrinsic::start_loop_iterations: case Intrinsic::annotation: case Intrinsic::ptr_annotation: Ops.push_back(II->getArgOperand(0)); return nullptr; default: break; } } break; } return nullptr; } const SCEV *ScalarEvolution::createSCEV(Value *V) { if (!isSCEVable(V->getType())) return getUnknown(V); if (Instruction *I = dyn_cast(V)) { // Don't attempt to analyze instructions in blocks that aren't // reachable. Such instructions don't matter, and they aren't required // to obey basic rules for definitions dominating uses which this // analysis depends on. if (!DT.isReachableFromEntry(I->getParent())) return getUnknown(PoisonValue::get(V->getType())); } else if (ConstantInt *CI = dyn_cast(V)) return getConstant(CI); else if (isa(V)) return getUnknown(V); else if (!isa(V)) return getUnknown(V); const SCEV *LHS; const SCEV *RHS; Operator *U = cast(V); if (auto BO = MatchBinaryOp(U, getDataLayout(), AC, DT, dyn_cast(V))) { switch (BO->Opcode) { case Instruction::Add: { // The simple thing to do would be to just call getSCEV on both operands // and call getAddExpr with the result. However if we're looking at a // bunch of things all added together, this can be quite inefficient, // because it leads to N-1 getAddExpr calls for N ultimate operands. // Instead, gather up all the operands and make a single getAddExpr call. // LLVM IR canonical form means we need only traverse the left operands. SmallVector AddOps; do { if (BO->Op) { if (auto *OpSCEV = getExistingSCEV(BO->Op)) { AddOps.push_back(OpSCEV); break; } // If a NUW or NSW flag can be applied to the SCEV for this // addition, then compute the SCEV for this addition by itself // with a separate call to getAddExpr. We need to do that // instead of pushing the operands of the addition onto AddOps, // since the flags are only known to apply to this particular // addition - they may not apply to other additions that can be // formed with operands from AddOps. 
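// For example, given ((a +nsw b) + c), the nsw flag only covers the inner a + b. Folding all three operands into one getAddExpr call would wrongly extend it to the whole sum, so the flagged addition is built separately and pushed onto AddOps as a single operand.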
const SCEV *RHS = getSCEV(BO->RHS); SCEV::NoWrapFlags Flags = getNoWrapFlagsFromUB(BO->Op); if (Flags != SCEV::FlagAnyWrap) { const SCEV *LHS = getSCEV(BO->LHS); if (BO->Opcode == Instruction::Sub) AddOps.push_back(getMinusSCEV(LHS, RHS, Flags)); else AddOps.push_back(getAddExpr(LHS, RHS, Flags)); break; } } if (BO->Opcode == Instruction::Sub) AddOps.push_back(getNegativeSCEV(getSCEV(BO->RHS))); else AddOps.push_back(getSCEV(BO->RHS)); auto NewBO = MatchBinaryOp(BO->LHS, getDataLayout(), AC, DT, dyn_cast(V)); if (!NewBO || (NewBO->Opcode != Instruction::Add && NewBO->Opcode != Instruction::Sub)) { AddOps.push_back(getSCEV(BO->LHS)); break; } BO = NewBO; } while (true); return getAddExpr(AddOps); } case Instruction::Mul: { SmallVector MulOps; do { if (BO->Op) { if (auto *OpSCEV = getExistingSCEV(BO->Op)) { MulOps.push_back(OpSCEV); break; } SCEV::NoWrapFlags Flags = getNoWrapFlagsFromUB(BO->Op); if (Flags != SCEV::FlagAnyWrap) { LHS = getSCEV(BO->LHS); RHS = getSCEV(BO->RHS); MulOps.push_back(getMulExpr(LHS, RHS, Flags)); break; } } MulOps.push_back(getSCEV(BO->RHS)); auto NewBO = MatchBinaryOp(BO->LHS, getDataLayout(), AC, DT, dyn_cast(V)); if (!NewBO || NewBO->Opcode != Instruction::Mul) { MulOps.push_back(getSCEV(BO->LHS)); break; } BO = NewBO; } while (true); return getMulExpr(MulOps); } case Instruction::UDiv: LHS = getSCEV(BO->LHS); RHS = getSCEV(BO->RHS); return getUDivExpr(LHS, RHS); case Instruction::URem: LHS = getSCEV(BO->LHS); RHS = getSCEV(BO->RHS); return getURemExpr(LHS, RHS); case Instruction::Sub: { SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap; if (BO->Op) Flags = getNoWrapFlagsFromUB(BO->Op); LHS = getSCEV(BO->LHS); RHS = getSCEV(BO->RHS); return getMinusSCEV(LHS, RHS, Flags); } case Instruction::And: // For an expression like x&255 that merely masks off the high bits, // use zext(trunc(x)) as the SCEV expression. if (ConstantInt *CI = dyn_cast(BO->RHS)) { if (CI->isZero()) return getSCEV(BO->RHS); if (CI->isMinusOne()) return getSCEV(BO->LHS); const APInt &A = CI->getValue(); // Instcombine's ShrinkDemandedConstant may strip bits out of // constants, obscuring what would otherwise be a low-bits mask. // Use computeKnownBits to compute what ShrinkDemandedConstant // knew about to reconstruct a low-bits mask value. unsigned LZ = A.countl_zero(); unsigned TZ = A.countr_zero(); unsigned BitWidth = A.getBitWidth(); KnownBits Known(BitWidth); computeKnownBits(BO->LHS, Known, getDataLayout(), 0, &AC, nullptr, &DT); APInt EffectiveMask = APInt::getLowBitsSet(BitWidth, BitWidth - LZ - TZ).shl(TZ); if ((LZ != 0 || TZ != 0) && !((~A & ~Known.Zero) & EffectiveMask)) { const SCEV *MulCount = getConstant(APInt::getOneBitSet(BitWidth, TZ)); const SCEV *LHS = getSCEV(BO->LHS); const SCEV *ShiftedLHS = nullptr; if (auto *LHSMul = dyn_cast(LHS)) { if (auto *OpC = dyn_cast(LHSMul->getOperand(0))) { // For an expression like (x * 8) & 8, simplify the multiply. 
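// For (x * 8) & 8: TZ == 3 and the multiplier contributes MulZeros == 3, so GCD == 3 and DivAmt == 1; the multiplier constant is reduced to 1 and ShiftedLHS folds down to x itself instead of the generic (x * 8) /u 8 form.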
unsigned MulZeros = OpC->getAPInt().countr_zero(); unsigned GCD = std::min(MulZeros, TZ); APInt DivAmt = APInt::getOneBitSet(BitWidth, TZ - GCD); SmallVector MulOps; MulOps.push_back(getConstant(OpC->getAPInt().lshr(GCD))); append_range(MulOps, LHSMul->operands().drop_front()); auto *NewMul = getMulExpr(MulOps, LHSMul->getNoWrapFlags()); ShiftedLHS = getUDivExpr(NewMul, getConstant(DivAmt)); } } if (!ShiftedLHS) ShiftedLHS = getUDivExpr(LHS, MulCount); return getMulExpr( getZeroExtendExpr( getTruncateExpr(ShiftedLHS, IntegerType::get(getContext(), BitWidth - LZ - TZ)), BO->LHS->getType()), MulCount); } } // Binary `and` is a bit-wise `umin`. if (BO->LHS->getType()->isIntegerTy(1)) { LHS = getSCEV(BO->LHS); RHS = getSCEV(BO->RHS); return getUMinExpr(LHS, RHS); } break; case Instruction::Or: // Binary `or` is a bit-wise `umax`. if (BO->LHS->getType()->isIntegerTy(1)) { LHS = getSCEV(BO->LHS); RHS = getSCEV(BO->RHS); return getUMaxExpr(LHS, RHS); } break; case Instruction::Xor: if (ConstantInt *CI = dyn_cast(BO->RHS)) { // If the RHS of xor is -1, then this is a not operation. if (CI->isMinusOne()) return getNotSCEV(getSCEV(BO->LHS)); // Model xor(and(x, C), C) as and(~x, C), if C is a low-bits mask. // This is a variant of the check for xor with -1, and it handles // the case where instcombine has trimmed non-demanded bits out // of an xor with -1. if (auto *LBO = dyn_cast(BO->LHS)) if (ConstantInt *LCI = dyn_cast(LBO->getOperand(1))) if (LBO->getOpcode() == Instruction::And && LCI->getValue() == CI->getValue()) if (const SCEVZeroExtendExpr *Z = dyn_cast(getSCEV(BO->LHS))) { Type *UTy = BO->LHS->getType(); const SCEV *Z0 = Z->getOperand(); Type *Z0Ty = Z0->getType(); unsigned Z0TySize = getTypeSizeInBits(Z0Ty); // If C is a low-bits mask, the zero extend is serving to // mask off the high bits. Complement the operand and // re-apply the zext. if (CI->getValue().isMask(Z0TySize)) return getZeroExtendExpr(getNotSCEV(Z0), UTy); // If C is a single bit, it may be in the sign-bit position // before the zero-extend. In this case, represent the xor // using an add, which is equivalent, and re-apply the zext. APInt Trunc = CI->getValue().trunc(Z0TySize); if (Trunc.zext(getTypeSizeInBits(UTy)) == CI->getValue() && Trunc.isSignMask()) return getZeroExtendExpr(getAddExpr(Z0, getConstant(Trunc)), UTy); } } break; case Instruction::Shl: // Turn shift left of a constant amount into a multiply. if (ConstantInt *SA = dyn_cast(BO->RHS)) { uint32_t BitWidth = cast(SA->getType())->getBitWidth(); // If the shift count is not less than the bitwidth, the result of // the shift is undefined. Don't try to analyze it, because the // resolution chosen here may differ from the resolution chosen in // other parts of the compiler. if (SA->getValue().uge(BitWidth)) break; // We can safely preserve the nuw flag in all cases. It's also safe to // turn a nuw nsw shl into a nuw nsw mul. However, nsw in isolation // requires special handling. It can be preserved as long as we're not // left shifting by bitwidth - 1. 
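// The excluded case is a shift by bitwidth - 1: shl nsw i8 %x, 7 corresponds to a multiply by 0x80, i.e. by -128. For %x == -1 the shift legitimately produces -128, yet mul nsw i8 -1, -128 would overflow, so nsw only carries over here when nuw additionally restricts the shifted value.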
auto Flags = SCEV::FlagAnyWrap; if (BO->Op) { auto MulFlags = getNoWrapFlagsFromUB(BO->Op); if ((MulFlags & SCEV::FlagNSW) && ((MulFlags & SCEV::FlagNUW) || SA->getValue().ult(BitWidth - 1))) Flags = (SCEV::NoWrapFlags)(Flags | SCEV::FlagNSW); if (MulFlags & SCEV::FlagNUW) Flags = (SCEV::NoWrapFlags)(Flags | SCEV::FlagNUW); } ConstantInt *X = ConstantInt::get( getContext(), APInt::getOneBitSet(BitWidth, SA->getZExtValue())); return getMulExpr(getSCEV(BO->LHS), getConstant(X), Flags); } break; case Instruction::AShr: { // AShr X, C, where C is a constant. ConstantInt *CI = dyn_cast(BO->RHS); if (!CI) break; Type *OuterTy = BO->LHS->getType(); uint64_t BitWidth = getTypeSizeInBits(OuterTy); // If the shift count is not less than the bitwidth, the result of // the shift is undefined. Don't try to analyze it, because the // resolution chosen here may differ from the resolution chosen in // other parts of the compiler. if (CI->getValue().uge(BitWidth)) break; if (CI->isZero()) return getSCEV(BO->LHS); // shift by zero --> noop uint64_t AShrAmt = CI->getZExtValue(); Type *TruncTy = IntegerType::get(getContext(), BitWidth - AShrAmt); Operator *L = dyn_cast(BO->LHS); if (L && L->getOpcode() == Instruction::Shl) { // X = Shl A, n // Y = AShr X, m // Both n and m are constant. const SCEV *ShlOp0SCEV = getSCEV(L->getOperand(0)); if (L->getOperand(1) == BO->RHS) // For a two-shift sext-inreg, i.e. n = m, // use sext(trunc(x)) as the SCEV expression. return getSignExtendExpr( getTruncateExpr(ShlOp0SCEV, TruncTy), OuterTy); ConstantInt *ShlAmtCI = dyn_cast(L->getOperand(1)); if (ShlAmtCI && ShlAmtCI->getValue().ult(BitWidth)) { uint64_t ShlAmt = ShlAmtCI->getZExtValue(); if (ShlAmt > AShrAmt) { // When n > m, use sext(mul(trunc(x), 2^(n-m)))) as the SCEV // expression. We already checked that ShlAmt < BitWidth, so // the multiplier, 1 << (ShlAmt - AShrAmt), fits into TruncTy as // ShlAmt - AShrAmt < Amt. APInt Mul = APInt::getOneBitSet(BitWidth - AShrAmt, ShlAmt - AShrAmt); return getSignExtendExpr( getMulExpr(getTruncateExpr(ShlOp0SCEV, TruncTy), getConstant(Mul)), OuterTy); } } } break; } } } switch (U->getOpcode()) { case Instruction::Trunc: return getTruncateExpr(getSCEV(U->getOperand(0)), U->getType()); case Instruction::ZExt: return getZeroExtendExpr(getSCEV(U->getOperand(0)), U->getType()); case Instruction::SExt: if (auto BO = MatchBinaryOp(U->getOperand(0), getDataLayout(), AC, DT, dyn_cast(V))) { // The NSW flag of a subtract does not always survive the conversion to // A + (-1)*B. By pushing sign extension onto its operands we are much // more likely to preserve NSW and allow later AddRec optimisations. // // NOTE: This is effectively duplicating this logic from getSignExtend: // sext((A + B + ...)) --> (sext(A) + sext(B) + ...) // but by that point the NSW information has potentially been lost. if (BO->Opcode == Instruction::Sub && BO->IsNSW) { Type *Ty = U->getType(); auto *V1 = getSignExtendExpr(getSCEV(BO->LHS), Ty); auto *V2 = getSignExtendExpr(getSCEV(BO->RHS), Ty); return getMinusSCEV(V1, V2, SCEV::FlagNSW); } } return getSignExtendExpr(getSCEV(U->getOperand(0)), U->getType()); case Instruction::BitCast: // BitCasts are no-op casts so we just eliminate the cast. if (isSCEVable(U->getType()) && isSCEVable(U->getOperand(0)->getType())) return getSCEV(U->getOperand(0)); break; case Instruction::PtrToInt: { // Pointer to integer cast is straight-forward, so do model it. 
const SCEV *Op = getSCEV(U->getOperand(0)); Type *DstIntTy = U->getType(); // But only if effective SCEV (integer) type is wide enough to represent // all possible pointer values. const SCEV *IntOp = getPtrToIntExpr(Op, DstIntTy); if (isa(IntOp)) return getUnknown(V); return IntOp; } case Instruction::IntToPtr: // Just don't deal with inttoptr casts. return getUnknown(V); case Instruction::SDiv: // If both operands are non-negative, this is just an udiv. if (isKnownNonNegative(getSCEV(U->getOperand(0))) && isKnownNonNegative(getSCEV(U->getOperand(1)))) return getUDivExpr(getSCEV(U->getOperand(0)), getSCEV(U->getOperand(1))); break; case Instruction::SRem: // If both operands are non-negative, this is just an urem. if (isKnownNonNegative(getSCEV(U->getOperand(0))) && isKnownNonNegative(getSCEV(U->getOperand(1)))) return getURemExpr(getSCEV(U->getOperand(0)), getSCEV(U->getOperand(1))); break; case Instruction::GetElementPtr: return createNodeForGEP(cast(U)); case Instruction::PHI: return createNodeForPHI(cast(U)); case Instruction::Select: return createNodeForSelectOrPHI(U, U->getOperand(0), U->getOperand(1), U->getOperand(2)); case Instruction::Call: case Instruction::Invoke: if (Value *RV = cast(U)->getReturnedArgOperand()) return getSCEV(RV); if (auto *II = dyn_cast(U)) { switch (II->getIntrinsicID()) { case Intrinsic::abs: return getAbsExpr( getSCEV(II->getArgOperand(0)), /*IsNSW=*/cast(II->getArgOperand(1))->isOne()); case Intrinsic::umax: LHS = getSCEV(II->getArgOperand(0)); RHS = getSCEV(II->getArgOperand(1)); return getUMaxExpr(LHS, RHS); case Intrinsic::umin: LHS = getSCEV(II->getArgOperand(0)); RHS = getSCEV(II->getArgOperand(1)); return getUMinExpr(LHS, RHS); case Intrinsic::smax: LHS = getSCEV(II->getArgOperand(0)); RHS = getSCEV(II->getArgOperand(1)); return getSMaxExpr(LHS, RHS); case Intrinsic::smin: LHS = getSCEV(II->getArgOperand(0)); RHS = getSCEV(II->getArgOperand(1)); return getSMinExpr(LHS, RHS); case Intrinsic::usub_sat: { const SCEV *X = getSCEV(II->getArgOperand(0)); const SCEV *Y = getSCEV(II->getArgOperand(1)); const SCEV *ClampedY = getUMinExpr(X, Y); return getMinusSCEV(X, ClampedY, SCEV::FlagNUW); } case Intrinsic::uadd_sat: { const SCEV *X = getSCEV(II->getArgOperand(0)); const SCEV *Y = getSCEV(II->getArgOperand(1)); const SCEV *ClampedX = getUMinExpr(X, getNotSCEV(Y)); return getAddExpr(ClampedX, Y, SCEV::FlagNUW); } case Intrinsic::start_loop_iterations: case Intrinsic::annotation: case Intrinsic::ptr_annotation: // A start_loop_iterations or llvm.annotation or llvm.prt.annotation is // just eqivalent to the first operand for SCEV purposes. 
return getSCEV(II->getArgOperand(0)); case Intrinsic::vscale: return getVScale(II->getType()); default: break; } } break; } return getUnknown(V); } //===----------------------------------------------------------------------===// // Iteration Count Computation Code // const SCEV *ScalarEvolution::getTripCountFromExitCount(const SCEV *ExitCount) { if (isa(ExitCount)) return getCouldNotCompute(); auto *ExitCountType = ExitCount->getType(); assert(ExitCountType->isIntegerTy()); auto *EvalTy = Type::getIntNTy(ExitCountType->getContext(), 1 + ExitCountType->getScalarSizeInBits()); return getTripCountFromExitCount(ExitCount, EvalTy, nullptr); } const SCEV *ScalarEvolution::getTripCountFromExitCount(const SCEV *ExitCount, Type *EvalTy, const Loop *L) { if (isa(ExitCount)) return getCouldNotCompute(); unsigned ExitCountSize = getTypeSizeInBits(ExitCount->getType()); unsigned EvalSize = EvalTy->getPrimitiveSizeInBits(); auto CanAddOneWithoutOverflow = [&]() { ConstantRange ExitCountRange = getRangeRef(ExitCount, RangeSignHint::HINT_RANGE_UNSIGNED); if (!ExitCountRange.contains(APInt::getMaxValue(ExitCountSize))) return true; return L && isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, ExitCount, getMinusOne(ExitCount->getType())); }; // If we need to zero extend the backedge count, check if we can add one to // it prior to zero extending without overflow. Provided this is safe, it // allows better simplification of the +1. if (EvalSize > ExitCountSize && CanAddOneWithoutOverflow()) return getZeroExtendExpr( getAddExpr(ExitCount, getOne(ExitCount->getType())), EvalTy); // Get the total trip count from the count by adding 1. This may wrap. return getAddExpr(getTruncateOrZeroExtend(ExitCount, EvalTy), getOne(EvalTy)); } static unsigned getConstantTripCount(const SCEVConstant *ExitCount) { if (!ExitCount) return 0; ConstantInt *ExitConst = ExitCount->getValue(); // Guard against huge trip counts. if (ExitConst->getValue().getActiveBits() > 32) return 0; // In case of integer overflow, this returns 0, which is correct. 
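// The only way the +1 below can wrap is an exit count of exactly 0xFFFFFFFF (anything wider was rejected by the active-bits check above); the wrapped result of 0 then reads as "trip count unknown", which is the same conservative answer the early returns use.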
return ((unsigned)ExitConst->getZExtValue()) + 1; } unsigned ScalarEvolution::getSmallConstantTripCount(const Loop *L) { auto *ExitCount = dyn_cast(getBackedgeTakenCount(L, Exact)); return getConstantTripCount(ExitCount); } unsigned ScalarEvolution::getSmallConstantTripCount(const Loop *L, const BasicBlock *ExitingBlock) { assert(ExitingBlock && "Must pass a non-null exiting block!"); assert(L->isLoopExiting(ExitingBlock) && "Exiting block must actually branch out of the loop!"); const SCEVConstant *ExitCount = dyn_cast(getExitCount(L, ExitingBlock)); return getConstantTripCount(ExitCount); } unsigned ScalarEvolution::getSmallConstantMaxTripCount(const Loop *L) { const auto *MaxExitCount = dyn_cast(getConstantMaxBackedgeTakenCount(L)); return getConstantTripCount(MaxExitCount); } unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L) { SmallVector ExitingBlocks; L->getExitingBlocks(ExitingBlocks); std::optional Res; for (auto *ExitingBB : ExitingBlocks) { unsigned Multiple = getSmallConstantTripMultiple(L, ExitingBB); if (!Res) Res = Multiple; Res = (unsigned)std::gcd(*Res, Multiple); } return Res.value_or(1); } unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount) { if (ExitCount == getCouldNotCompute()) return 1; // Get the trip count const SCEV *TCExpr = getTripCountFromExitCount(applyLoopGuards(ExitCount, L)); APInt Multiple = getNonZeroConstantMultiple(TCExpr); // If a trip multiple is huge (>=2^32), the trip count is still divisible by // the greatest power of 2 divisor less than 2^32. return Multiple.getActiveBits() > 32 ? 1U << std::min((unsigned)31, Multiple.countTrailingZeros()) : (unsigned)Multiple.zextOrTrunc(32).getZExtValue(); } /// Returns the largest constant divisor of the trip count of this loop as a /// normal unsigned value, if possible. This means that the actual trip count is /// always a multiple of the returned value (note that the trip count could /// also be zero). /// /// Returns 1 if the trip count is unknown or not guaranteed to be a /// multiple of a constant (which is also the case if the trip count is simply /// constant; use getSmallConstantTripCount for that case). It will also return /// 1 if the trip count is very large (>= 2^32). /// /// As explained in the comments for getSmallConstantTripCount, this assumes /// that control exits the loop via ExitingBlock. 
unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L, const BasicBlock *ExitingBlock) { assert(ExitingBlock && "Must pass a non-null exiting block!"); assert(L->isLoopExiting(ExitingBlock) && "Exiting block must actually branch out of the loop!"); const SCEV *ExitCount = getExitCount(L, ExitingBlock); return getSmallConstantTripMultiple(L, ExitCount); } const SCEV *ScalarEvolution::getExitCount(const Loop *L, const BasicBlock *ExitingBlock, ExitCountKind Kind) { switch (Kind) { case Exact: return getBackedgeTakenInfo(L).getExact(ExitingBlock, this); case SymbolicMaximum: return getBackedgeTakenInfo(L).getSymbolicMax(ExitingBlock, this); case ConstantMaximum: return getBackedgeTakenInfo(L).getConstantMax(ExitingBlock, this); }; llvm_unreachable("Invalid ExitCountKind!"); } const SCEV * ScalarEvolution::getPredicatedBackedgeTakenCount(const Loop *L, SmallVector &Preds) { return getPredicatedBackedgeTakenInfo(L).getExact(L, this, &Preds); } const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L, ExitCountKind Kind) { switch (Kind) { case Exact: return getBackedgeTakenInfo(L).getExact(L, this); case ConstantMaximum: return getBackedgeTakenInfo(L).getConstantMax(this); case SymbolicMaximum: return getBackedgeTakenInfo(L).getSymbolicMax(L, this); }; llvm_unreachable("Invalid ExitCountKind!"); } bool ScalarEvolution::isBackedgeTakenCountMaxOrZero(const Loop *L) { return getBackedgeTakenInfo(L).isConstantMaxOrZero(this); } /// Push PHI nodes in the header of the given loop onto the given Worklist. static void PushLoopPHIs(const Loop *L, SmallVectorImpl &Worklist, SmallPtrSetImpl &Visited) { BasicBlock *Header = L->getHeader(); // Push all Loop-header PHIs onto the Worklist stack. for (PHINode &PN : Header->phis()) if (Visited.insert(&PN).second) Worklist.push_back(&PN); } const ScalarEvolution::BackedgeTakenInfo & ScalarEvolution::getPredicatedBackedgeTakenInfo(const Loop *L) { auto &BTI = getBackedgeTakenInfo(L); if (BTI.hasFullInfo()) return BTI; auto Pair = PredicatedBackedgeTakenCounts.insert({L, BackedgeTakenInfo()}); if (!Pair.second) return Pair.first->second; BackedgeTakenInfo Result = computeBackedgeTakenCount(L, /*AllowPredicates=*/true); return PredicatedBackedgeTakenCounts.find(L)->second = std::move(Result); } ScalarEvolution::BackedgeTakenInfo & ScalarEvolution::getBackedgeTakenInfo(const Loop *L) { // Initially insert an invalid entry for this loop. If the insertion // succeeds, proceed to actually compute a backedge-taken count and // update the value. The temporary CouldNotCompute value tells SCEV // code elsewhere that it shouldn't attempt to request a new // backedge-taken count, which could result in infinite recursion. std::pair::iterator, bool> Pair = BackedgeTakenCounts.insert({L, BackedgeTakenInfo()}); if (!Pair.second) return Pair.first->second; // computeBackedgeTakenCount may allocate memory for its result. Inserting it // into the BackedgeTakenCounts map transfers ownership. Otherwise, the result // must be cleared in this scope. BackedgeTakenInfo Result = computeBackedgeTakenCount(L); // Now that we know more about the trip count for this loop, forget any // existing SCEV values for PHI nodes in this loop since they are only // conservative estimates made without the benefit of trip count // information. This invalidation is not necessary for correctness, and is // only done to produce more precise results. if (Result.hasAnyInfo()) { // Invalidate any expression using an addrec in this loop. 
SmallVector ToForget; auto LoopUsersIt = LoopUsers.find(L); if (LoopUsersIt != LoopUsers.end()) append_range(ToForget, LoopUsersIt->second); forgetMemoizedResults(ToForget); // Invalidate constant-evolved loop header phis. for (PHINode &PN : L->getHeader()->phis()) ConstantEvolutionLoopExitValue.erase(&PN); } // Re-lookup the insert position, since the call to // computeBackedgeTakenCount above could result in a // recusive call to getBackedgeTakenInfo (on a different // loop), which would invalidate the iterator computed // earlier. return BackedgeTakenCounts.find(L)->second = std::move(Result); } void ScalarEvolution::forgetAllLoops() { // This method is intended to forget all info about loops. It should // invalidate caches as if the following happened: // - The trip counts of all loops have changed arbitrarily // - Every llvm::Value has been updated in place to produce a different // result. BackedgeTakenCounts.clear(); PredicatedBackedgeTakenCounts.clear(); BECountUsers.clear(); LoopPropertiesCache.clear(); ConstantEvolutionLoopExitValue.clear(); ValueExprMap.clear(); ValuesAtScopes.clear(); ValuesAtScopesUsers.clear(); LoopDispositions.clear(); BlockDispositions.clear(); UnsignedRanges.clear(); SignedRanges.clear(); ExprValueMap.clear(); HasRecMap.clear(); ConstantMultipleCache.clear(); PredicatedSCEVRewrites.clear(); FoldCache.clear(); FoldCacheUser.clear(); } void ScalarEvolution::visitAndClearUsers( SmallVectorImpl &Worklist, SmallPtrSetImpl &Visited, SmallVectorImpl &ToForget) { while (!Worklist.empty()) { Instruction *I = Worklist.pop_back_val(); if (!isSCEVable(I->getType())) continue; ValueExprMapType::iterator It = ValueExprMap.find_as(static_cast(I)); if (It != ValueExprMap.end()) { eraseValueFromMap(It->first); ToForget.push_back(It->second); if (PHINode *PN = dyn_cast(I)) ConstantEvolutionLoopExitValue.erase(PN); } PushDefUseChildren(I, Worklist, Visited); } } void ScalarEvolution::forgetLoop(const Loop *L) { SmallVector LoopWorklist(1, L); SmallVector Worklist; SmallPtrSet Visited; SmallVector ToForget; // Iterate over all the loops and sub-loops to drop SCEV information. while (!LoopWorklist.empty()) { auto *CurrL = LoopWorklist.pop_back_val(); // Drop any stored trip count value. forgetBackedgeTakenCounts(CurrL, /* Predicated */ false); forgetBackedgeTakenCounts(CurrL, /* Predicated */ true); // Drop information about predicated SCEV rewrites for this loop. for (auto I = PredicatedSCEVRewrites.begin(); I != PredicatedSCEVRewrites.end();) { std::pair Entry = I->first; if (Entry.second == CurrL) PredicatedSCEVRewrites.erase(I++); else ++I; } auto LoopUsersItr = LoopUsers.find(CurrL); if (LoopUsersItr != LoopUsers.end()) { ToForget.insert(ToForget.end(), LoopUsersItr->second.begin(), LoopUsersItr->second.end()); } // Drop information about expressions based on loop-header PHIs. PushLoopPHIs(CurrL, Worklist, Visited); visitAndClearUsers(Worklist, Visited, ToForget); LoopPropertiesCache.erase(CurrL); // Forget all contained loops too, to avoid dangling entries in the // ValuesAtScopes map. LoopWorklist.append(CurrL->begin(), CurrL->end()); } forgetMemoizedResults(ToForget); } void ScalarEvolution::forgetTopmostLoop(const Loop *L) { forgetLoop(L->getOutermostLoop()); } void ScalarEvolution::forgetValue(Value *V) { Instruction *I = dyn_cast(V); if (!I) return; // Drop information about expressions based on loop-header PHIs. 
SmallVector Worklist; SmallPtrSet Visited; SmallVector ToForget; Worklist.push_back(I); Visited.insert(I); visitAndClearUsers(Worklist, Visited, ToForget); forgetMemoizedResults(ToForget); } void ScalarEvolution::forgetLoopDispositions() { LoopDispositions.clear(); } void ScalarEvolution::forgetBlockAndLoopDispositions(Value *V) { // Unless a specific value is passed to invalidation, completely clear both // caches. if (!V) { BlockDispositions.clear(); LoopDispositions.clear(); return; } if (!isSCEVable(V->getType())) return; const SCEV *S = getExistingSCEV(V); if (!S) return; // Invalidate the block and loop dispositions cached for S. Dispositions of // S's users may change if S's disposition changes (i.e. a user may change to // loop-invariant, if S changes to loop invariant), so also invalidate // dispositions of S's users recursively. SmallVector Worklist = {S}; SmallPtrSet Seen = {S}; while (!Worklist.empty()) { const SCEV *Curr = Worklist.pop_back_val(); bool LoopDispoRemoved = LoopDispositions.erase(Curr); bool BlockDispoRemoved = BlockDispositions.erase(Curr); if (!LoopDispoRemoved && !BlockDispoRemoved) continue; auto Users = SCEVUsers.find(Curr); if (Users != SCEVUsers.end()) for (const auto *User : Users->second) if (Seen.insert(User).second) Worklist.push_back(User); } } /// Get the exact loop backedge taken count considering all loop exits. A /// computable result can only be returned for loops with all exiting blocks /// dominating the latch. howFarToZero assumes that the limit of each loop test /// is never skipped. This is a valid assumption as long as the loop exits via /// that test. For precise results, it is the caller's responsibility to specify /// the relevant loop exiting block using getExact(ExitingBlock, SE). const SCEV * ScalarEvolution::BackedgeTakenInfo::getExact(const Loop *L, ScalarEvolution *SE, SmallVector *Preds) const { // If any exits were not computable, the loop is not computable. if (!isComplete() || ExitNotTaken.empty()) return SE->getCouldNotCompute(); const BasicBlock *Latch = L->getLoopLatch(); // All exiting blocks we have collected must dominate the only backedge. if (!Latch) return SE->getCouldNotCompute(); // All exiting blocks we have gathered dominate the loop's latch, so the exact // trip count is simply a minimum out of all these calculated exit counts. SmallVector Ops; for (const auto &ENT : ExitNotTaken) { const SCEV *BECount = ENT.ExactNotTaken; assert(BECount != SE->getCouldNotCompute() && "Bad exit SCEV!"); assert(SE->DT.dominates(ENT.ExitingBlock, Latch) && "We should only have known counts for exiting blocks that dominate " "latch!"); Ops.push_back(BECount); if (Preds) for (const auto *P : ENT.Predicates) Preds->push_back(P); assert((Preds || ENT.hasAlwaysTruePredicate()) && "Predicate should be always true!"); } // If an earlier exit exits on the first iteration (exit count zero), then // a later poison exit count should not propagate into the result. These are // exactly the semantics provided by umin_seq. return SE->getUMinFromMismatchedTypes(Ops, /* Sequential */ true); } /// Get the exact not taken count for this loop exit. 
const SCEV * ScalarEvolution::BackedgeTakenInfo::getExact(const BasicBlock *ExitingBlock, ScalarEvolution *SE) const { for (const auto &ENT : ExitNotTaken) if (ENT.ExitingBlock == ExitingBlock && ENT.hasAlwaysTruePredicate()) return ENT.ExactNotTaken; return SE->getCouldNotCompute(); } const SCEV *ScalarEvolution::BackedgeTakenInfo::getConstantMax( const BasicBlock *ExitingBlock, ScalarEvolution *SE) const { for (const auto &ENT : ExitNotTaken) if (ENT.ExitingBlock == ExitingBlock && ENT.hasAlwaysTruePredicate()) return ENT.ConstantMaxNotTaken; return SE->getCouldNotCompute(); } const SCEV *ScalarEvolution::BackedgeTakenInfo::getSymbolicMax( const BasicBlock *ExitingBlock, ScalarEvolution *SE) const { for (const auto &ENT : ExitNotTaken) if (ENT.ExitingBlock == ExitingBlock && ENT.hasAlwaysTruePredicate()) return ENT.SymbolicMaxNotTaken; return SE->getCouldNotCompute(); } /// getConstantMax - Get the constant max backedge taken count for the loop. const SCEV * ScalarEvolution::BackedgeTakenInfo::getConstantMax(ScalarEvolution *SE) const { auto PredicateNotAlwaysTrue = [](const ExitNotTakenInfo &ENT) { return !ENT.hasAlwaysTruePredicate(); }; if (!getConstantMax() || any_of(ExitNotTaken, PredicateNotAlwaysTrue)) return SE->getCouldNotCompute(); assert((isa(getConstantMax()) || isa(getConstantMax())) && "No point in having a non-constant max backedge taken count!"); return getConstantMax(); } const SCEV * ScalarEvolution::BackedgeTakenInfo::getSymbolicMax(const Loop *L, ScalarEvolution *SE) { if (!SymbolicMax) SymbolicMax = SE->computeSymbolicMaxBackedgeTakenCount(L); return SymbolicMax; } bool ScalarEvolution::BackedgeTakenInfo::isConstantMaxOrZero( ScalarEvolution *SE) const { auto PredicateNotAlwaysTrue = [](const ExitNotTakenInfo &ENT) { return !ENT.hasAlwaysTruePredicate(); }; return MaxOrZero && !any_of(ExitNotTaken, PredicateNotAlwaysTrue); } ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E) : ExitLimit(E, E, E, false, std::nullopt) {} ScalarEvolution::ExitLimit::ExitLimit( const SCEV *E, const SCEV *ConstantMaxNotTaken, const SCEV *SymbolicMaxNotTaken, bool MaxOrZero, ArrayRef *> PredSetList) : ExactNotTaken(E), ConstantMaxNotTaken(ConstantMaxNotTaken), SymbolicMaxNotTaken(SymbolicMaxNotTaken), MaxOrZero(MaxOrZero) { // If we prove the max count is zero, so is the symbolic bound. This happens // in practice due to differences in a) how context sensitive we've chosen // to be and b) how we reason about bounds implied by UB. 
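// A constant maximum of zero pins the other counts too: the backedge can never be taken, so Exact and SymbolicMax are strengthened to the same zero value instead of being left as weaker (possibly CouldNotCompute) bounds, which also keeps the consistency asserts below satisfied.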
if (ConstantMaxNotTaken->isZero()) { this->ExactNotTaken = E = ConstantMaxNotTaken; this->SymbolicMaxNotTaken = SymbolicMaxNotTaken = ConstantMaxNotTaken; } assert((isa(ExactNotTaken) || !isa(ConstantMaxNotTaken)) && "Exact is not allowed to be less precise than Constant Max"); assert((isa(ExactNotTaken) || !isa(SymbolicMaxNotTaken)) && "Exact is not allowed to be less precise than Symbolic Max"); assert((isa(SymbolicMaxNotTaken) || !isa(ConstantMaxNotTaken)) && "Symbolic Max is not allowed to be less precise than Constant Max"); assert((isa(ConstantMaxNotTaken) || isa(ConstantMaxNotTaken)) && "No point in having a non-constant max backedge taken count!"); for (const auto *PredSet : PredSetList) for (const auto *P : *PredSet) addPredicate(P); assert((isa(E) || !E->getType()->isPointerTy()) && "Backedge count should be int"); assert((isa(ConstantMaxNotTaken) || !ConstantMaxNotTaken->getType()->isPointerTy()) && "Max backedge count should be int"); } ScalarEvolution::ExitLimit::ExitLimit( const SCEV *E, const SCEV *ConstantMaxNotTaken, const SCEV *SymbolicMaxNotTaken, bool MaxOrZero, const SmallPtrSetImpl &PredSet) : ExitLimit(E, ConstantMaxNotTaken, SymbolicMaxNotTaken, MaxOrZero, { &PredSet }) {} /// Allocate memory for BackedgeTakenInfo and copy the not-taken count of each /// computable exit into a persistent ExitNotTakenInfo array. ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo( ArrayRef ExitCounts, bool IsComplete, const SCEV *ConstantMax, bool MaxOrZero) : ConstantMax(ConstantMax), IsComplete(IsComplete), MaxOrZero(MaxOrZero) { using EdgeExitInfo = ScalarEvolution::BackedgeTakenInfo::EdgeExitInfo; ExitNotTaken.reserve(ExitCounts.size()); std::transform(ExitCounts.begin(), ExitCounts.end(), std::back_inserter(ExitNotTaken), [&](const EdgeExitInfo &EEI) { BasicBlock *ExitBB = EEI.first; const ExitLimit &EL = EEI.second; return ExitNotTakenInfo(ExitBB, EL.ExactNotTaken, EL.ConstantMaxNotTaken, EL.SymbolicMaxNotTaken, EL.Predicates); }); assert((isa(ConstantMax) || isa(ConstantMax)) && "No point in having a non-constant max backedge taken count!"); } /// Compute the number of times the backedge of the specified loop will execute. ScalarEvolution::BackedgeTakenInfo ScalarEvolution::computeBackedgeTakenCount(const Loop *L, bool AllowPredicates) { SmallVector ExitingBlocks; L->getExitingBlocks(ExitingBlocks); using EdgeExitInfo = ScalarEvolution::BackedgeTakenInfo::EdgeExitInfo; SmallVector ExitCounts; bool CouldComputeBECount = true; BasicBlock *Latch = L->getLoopLatch(); // may be NULL. const SCEV *MustExitMaxBECount = nullptr; const SCEV *MayExitMaxBECount = nullptr; bool MustExitMaxOrZero = false; // Compute the ExitLimit for each loop exit. Use this to populate ExitCounts // and compute maxBECount. // Do a union of all the predicates here. for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { BasicBlock *ExitBB = ExitingBlocks[i]; // We canonicalize untaken exits to br (constant), ignore them so that // proving an exit untaken doesn't negatively impact our ability to reason // about the loop as whole. if (auto *BI = dyn_cast(ExitBB->getTerminator())) if (auto *CI = dyn_cast(BI->getCondition())) { bool ExitIfTrue = !L->contains(BI->getSuccessor(0)); if (ExitIfTrue == CI->isZero()) continue; } ExitLimit EL = computeExitLimit(L, ExitBB, AllowPredicates); assert((AllowPredicates || EL.Predicates.empty()) && "Predicated exit limit when predicates are not allowed!"); // 1. For each exit that can be computed, add an entry to ExitCounts. 
// CouldComputeBECount is true only if all exits can be computed. if (EL.ExactNotTaken != getCouldNotCompute()) ++NumExitCountsComputed; else // We couldn't compute an exact value for this exit, so // we won't be able to compute an exact value for the loop. CouldComputeBECount = false; // Remember exit count if either exact or symbolic is known. Because // Exact always implies symbolic, only check symbolic. if (EL.SymbolicMaxNotTaken != getCouldNotCompute()) ExitCounts.emplace_back(ExitBB, EL); else { assert(EL.ExactNotTaken == getCouldNotCompute() && "Exact is known but symbolic isn't?"); ++NumExitCountsNotComputed; } // 2. Derive the loop's MaxBECount from each exit's max number of // non-exiting iterations. Partition the loop exits into two kinds: // LoopMustExits and LoopMayExits. // // If the exit dominates the loop latch, it is a LoopMustExit otherwise it // is a LoopMayExit. If any computable LoopMustExit is found, then // MaxBECount is the minimum EL.ConstantMaxNotTaken of computable // LoopMustExits. Otherwise, MaxBECount is conservatively the maximum // EL.ConstantMaxNotTaken, where CouldNotCompute is considered greater than // any // computable EL.ConstantMaxNotTaken. if (EL.ConstantMaxNotTaken != getCouldNotCompute() && Latch && DT.dominates(ExitBB, Latch)) { if (!MustExitMaxBECount) { MustExitMaxBECount = EL.ConstantMaxNotTaken; MustExitMaxOrZero = EL.MaxOrZero; } else { MustExitMaxBECount = getUMinFromMismatchedTypes(MustExitMaxBECount, EL.ConstantMaxNotTaken); } } else if (MayExitMaxBECount != getCouldNotCompute()) { if (!MayExitMaxBECount || EL.ConstantMaxNotTaken == getCouldNotCompute()) MayExitMaxBECount = EL.ConstantMaxNotTaken; else { MayExitMaxBECount = getUMaxFromMismatchedTypes(MayExitMaxBECount, EL.ConstantMaxNotTaken); } } } const SCEV *MaxBECount = MustExitMaxBECount ? MustExitMaxBECount : (MayExitMaxBECount ? MayExitMaxBECount : getCouldNotCompute()); // The loop backedge will be taken the maximum or zero times if there's // a single exit that must be taken the maximum or zero times. bool MaxOrZero = (MustExitMaxOrZero && ExitingBlocks.size() == 1); // Remember which SCEVs are used in exit limits for invalidation purposes. // We only care about non-constant SCEVs here, so we can ignore // EL.ConstantMaxNotTaken // and MaxBECount, which must be SCEVConstant. for (const auto &Pair : ExitCounts) { if (!isa(Pair.second.ExactNotTaken)) BECountUsers[Pair.second.ExactNotTaken].insert({L, AllowPredicates}); if (!isa(Pair.second.SymbolicMaxNotTaken)) BECountUsers[Pair.second.SymbolicMaxNotTaken].insert( {L, AllowPredicates}); } return BackedgeTakenInfo(std::move(ExitCounts), CouldComputeBECount, MaxBECount, MaxOrZero); } ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimit(const Loop *L, BasicBlock *ExitingBlock, bool AllowPredicates) { assert(L->contains(ExitingBlock) && "Exit count for non-loop block?"); // If our exiting block does not dominate the latch, then its connection with // loop's exit limit may be far from trivial. 
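  // For example (illustrative CFG):
  //
  //   header:
  //     br i1 %c, label %maybe.exit, label %latch
  //   maybe.exit:
  //     br i1 %d, label %exit, label %latch
  //   latch:
  //     br label %header
  //
  // %maybe.exit exits the loop but does not dominate %latch, so an exit count
  // computed for it alone says little about how often the backedge executes;
  // the check below conservatively returns CouldNotCompute for such blocks.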
const BasicBlock *Latch = L->getLoopLatch(); if (!Latch || !DT.dominates(ExitingBlock, Latch)) return getCouldNotCompute(); bool IsOnlyExit = (L->getExitingBlock() != nullptr); Instruction *Term = ExitingBlock->getTerminator(); if (BranchInst *BI = dyn_cast(Term)) { assert(BI->isConditional() && "If unconditional, it can't be in loop!"); bool ExitIfTrue = !L->contains(BI->getSuccessor(0)); assert(ExitIfTrue == L->contains(BI->getSuccessor(1)) && "It should have one successor in loop and one exit block!"); // Proceed to the next level to examine the exit condition expression. return computeExitLimitFromCond(L, BI->getCondition(), ExitIfTrue, /*ControlsOnlyExit=*/IsOnlyExit, AllowPredicates); } if (SwitchInst *SI = dyn_cast(Term)) { // For switch, make sure that there is a single exit from the loop. BasicBlock *Exit = nullptr; for (auto *SBB : successors(ExitingBlock)) if (!L->contains(SBB)) { if (Exit) // Multiple exit successors. return getCouldNotCompute(); Exit = SBB; } assert(Exit && "Exiting block must have at least one exit"); return computeExitLimitFromSingleExitSwitch( L, SI, Exit, /*ControlsOnlyExit=*/IsOnlyExit); } return getCouldNotCompute(); } ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCond( const Loop *L, Value *ExitCond, bool ExitIfTrue, bool ControlsOnlyExit, bool AllowPredicates) { ScalarEvolution::ExitLimitCacheTy Cache(L, ExitIfTrue, AllowPredicates); return computeExitLimitFromCondCached(Cache, L, ExitCond, ExitIfTrue, ControlsOnlyExit, AllowPredicates); } std::optional ScalarEvolution::ExitLimitCache::find(const Loop *L, Value *ExitCond, bool ExitIfTrue, bool ControlsOnlyExit, bool AllowPredicates) { (void)this->L; (void)this->ExitIfTrue; (void)this->AllowPredicates; assert(this->L == L && this->ExitIfTrue == ExitIfTrue && this->AllowPredicates == AllowPredicates && "Variance in assumed invariant key components!"); auto Itr = TripCountMap.find({ExitCond, ControlsOnlyExit}); if (Itr == TripCountMap.end()) return std::nullopt; return Itr->second; } void ScalarEvolution::ExitLimitCache::insert(const Loop *L, Value *ExitCond, bool ExitIfTrue, bool ControlsOnlyExit, bool AllowPredicates, const ExitLimit &EL) { assert(this->L == L && this->ExitIfTrue == ExitIfTrue && this->AllowPredicates == AllowPredicates && "Variance in assumed invariant key components!"); auto InsertResult = TripCountMap.insert({{ExitCond, ControlsOnlyExit}, EL}); assert(InsertResult.second && "Expected successful insertion!"); (void)InsertResult; (void)ExitIfTrue; } ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondCached( ExitLimitCacheTy &Cache, const Loop *L, Value *ExitCond, bool ExitIfTrue, bool ControlsOnlyExit, bool AllowPredicates) { if (auto MaybeEL = Cache.find(L, ExitCond, ExitIfTrue, ControlsOnlyExit, AllowPredicates)) return *MaybeEL; ExitLimit EL = computeExitLimitFromCondImpl( Cache, L, ExitCond, ExitIfTrue, ControlsOnlyExit, AllowPredicates); Cache.insert(L, ExitCond, ExitIfTrue, ControlsOnlyExit, AllowPredicates, EL); return EL; } ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondImpl( ExitLimitCacheTy &Cache, const Loop *L, Value *ExitCond, bool ExitIfTrue, bool ControlsOnlyExit, bool AllowPredicates) { // Handle BinOp conditions (And, Or). if (auto LimitFromBinOp = computeExitLimitFromCondFromBinOp( Cache, L, ExitCond, ExitIfTrue, ControlsOnlyExit, AllowPredicates)) return *LimitFromBinOp; // With an icmp, it may be feasible to compute an exact backedge-taken count. // Proceed to the next level to examine the icmp. 
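  // For example, if the sole exit branches out of the loop when
  // `icmp eq i32 %iv, 100` is true and %iv is the addrec {0,+,1}<L>, the
  // predicate is inverted to ne, and howFarToZero on
  // ({0,+,1}<L> - 100) = {-100,+,1}<L> yields an exact exit count of 100.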
if (ICmpInst *ExitCondICmp = dyn_cast(ExitCond)) { ExitLimit EL = computeExitLimitFromICmp(L, ExitCondICmp, ExitIfTrue, ControlsOnlyExit); if (EL.hasFullInfo() || !AllowPredicates) return EL; // Try again, but use SCEV predicates this time. return computeExitLimitFromICmp(L, ExitCondICmp, ExitIfTrue, ControlsOnlyExit, /*AllowPredicates=*/true); } // Check for a constant condition. These are normally stripped out by // SimplifyCFG, but ScalarEvolution may be used by a pass which wishes to // preserve the CFG and is temporarily leaving constant conditions // in place. if (ConstantInt *CI = dyn_cast(ExitCond)) { if (ExitIfTrue == !CI->getZExtValue()) // The backedge is always taken. return getCouldNotCompute(); // The backedge is never taken. return getZero(CI->getType()); } // If we're exiting based on the overflow flag of an x.with.overflow intrinsic // with a constant step, we can form an equivalent icmp predicate and figure // out how many iterations will be taken before we exit. const WithOverflowInst *WO; const APInt *C; if (match(ExitCond, m_ExtractValue<1>(m_WithOverflowInst(WO))) && match(WO->getRHS(), m_APInt(C))) { ConstantRange NWR = ConstantRange::makeExactNoWrapRegion(WO->getBinaryOp(), *C, WO->getNoWrapKind()); CmpInst::Predicate Pred; APInt NewRHSC, Offset; NWR.getEquivalentICmp(Pred, NewRHSC, Offset); if (!ExitIfTrue) Pred = ICmpInst::getInversePredicate(Pred); auto *LHS = getSCEV(WO->getLHS()); if (Offset != 0) LHS = getAddExpr(LHS, getConstant(Offset)); auto EL = computeExitLimitFromICmp(L, Pred, LHS, getConstant(NewRHSC), ControlsOnlyExit, AllowPredicates); if (EL.hasAnyInfo()) return EL; } // If it's not an integer or pointer comparison then compute it the hard way. return computeExitCountExhaustively(L, ExitCond, ExitIfTrue); } std::optional ScalarEvolution::computeExitLimitFromCondFromBinOp( ExitLimitCacheTy &Cache, const Loop *L, Value *ExitCond, bool ExitIfTrue, bool ControlsOnlyExit, bool AllowPredicates) { // Check if the controlling expression for this loop is an And or Or. Value *Op0, *Op1; bool IsAnd = false; if (match(ExitCond, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))) IsAnd = true; else if (match(ExitCond, m_LogicalOr(m_Value(Op0), m_Value(Op1)))) IsAnd = false; else return std::nullopt; // EitherMayExit is true in these two cases: // br (and Op0 Op1), loop, exit // br (or Op0 Op1), exit, loop bool EitherMayExit = IsAnd ^ ExitIfTrue; ExitLimit EL0 = computeExitLimitFromCondCached( Cache, L, Op0, ExitIfTrue, ControlsOnlyExit && !EitherMayExit, AllowPredicates); ExitLimit EL1 = computeExitLimitFromCondCached( Cache, L, Op1, ExitIfTrue, ControlsOnlyExit && !EitherMayExit, AllowPredicates); // Be robust against unsimplified IR for the form "op i1 X, NeutralElement" const Constant *NeutralElement = ConstantInt::get(ExitCond->getType(), IsAnd); if (isa(Op1)) return Op1 == NeutralElement ? EL0 : EL1; if (isa(Op0)) return Op0 == NeutralElement ? EL1 : EL0; const SCEV *BECount = getCouldNotCompute(); const SCEV *ConstantMaxBECount = getCouldNotCompute(); const SCEV *SymbolicMaxBECount = getCouldNotCompute(); if (EitherMayExit) { bool UseSequentialUMin = !isa(ExitCond); // Both conditions must be same for the loop to continue executing. // Choose the less conservative count. 
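    // E.g. for a select-form `%cond = select i1 %c0, i1 %c1, i1 false`
    // guarding the backedge, either operand may cause an exit, but %c1 is only
    // evaluated on iterations where %c0 still holds, so the exact counts are
    // combined with umin_seq. For a plain `and i1 %c0, %c1` both operands
    // execute on every iteration, so an ordinary umin suffices; that is what
    // UseSequentialUMin distinguishes above.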
if (EL0.ExactNotTaken != getCouldNotCompute() && EL1.ExactNotTaken != getCouldNotCompute()) { BECount = getUMinFromMismatchedTypes(EL0.ExactNotTaken, EL1.ExactNotTaken, UseSequentialUMin); } if (EL0.ConstantMaxNotTaken == getCouldNotCompute()) ConstantMaxBECount = EL1.ConstantMaxNotTaken; else if (EL1.ConstantMaxNotTaken == getCouldNotCompute()) ConstantMaxBECount = EL0.ConstantMaxNotTaken; else ConstantMaxBECount = getUMinFromMismatchedTypes(EL0.ConstantMaxNotTaken, EL1.ConstantMaxNotTaken); if (EL0.SymbolicMaxNotTaken == getCouldNotCompute()) SymbolicMaxBECount = EL1.SymbolicMaxNotTaken; else if (EL1.SymbolicMaxNotTaken == getCouldNotCompute()) SymbolicMaxBECount = EL0.SymbolicMaxNotTaken; else SymbolicMaxBECount = getUMinFromMismatchedTypes( EL0.SymbolicMaxNotTaken, EL1.SymbolicMaxNotTaken, UseSequentialUMin); } else { // Both conditions must be same at the same time for the loop to exit. // For now, be conservative. if (EL0.ExactNotTaken == EL1.ExactNotTaken) BECount = EL0.ExactNotTaken; } // There are cases (e.g. PR26207) where computeExitLimitFromCond is able // to be more aggressive when computing BECount than when computing // ConstantMaxBECount. In these cases it is possible for EL0.ExactNotTaken // and // EL1.ExactNotTaken to match, but for EL0.ConstantMaxNotTaken and // EL1.ConstantMaxNotTaken to not. if (isa(ConstantMaxBECount) && !isa(BECount)) ConstantMaxBECount = getConstant(getUnsignedRangeMax(BECount)); if (isa(SymbolicMaxBECount)) SymbolicMaxBECount = isa(BECount) ? ConstantMaxBECount : BECount; return ExitLimit(BECount, ConstantMaxBECount, SymbolicMaxBECount, false, { &EL0.Predicates, &EL1.Predicates }); } ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromICmp( const Loop *L, ICmpInst *ExitCond, bool ExitIfTrue, bool ControlsOnlyExit, bool AllowPredicates) { // If the condition was exit on true, convert the condition to exit on false ICmpInst::Predicate Pred; if (!ExitIfTrue) Pred = ExitCond->getPredicate(); else Pred = ExitCond->getInversePredicate(); const ICmpInst::Predicate OriginalPred = Pred; const SCEV *LHS = getSCEV(ExitCond->getOperand(0)); const SCEV *RHS = getSCEV(ExitCond->getOperand(1)); ExitLimit EL = computeExitLimitFromICmp(L, Pred, LHS, RHS, ControlsOnlyExit, AllowPredicates); if (EL.hasAnyInfo()) return EL; auto *ExhaustiveCount = computeExitCountExhaustively(L, ExitCond, ExitIfTrue); if (!isa(ExhaustiveCount)) return ExhaustiveCount; return computeShiftCompareExitLimit(ExitCond->getOperand(0), ExitCond->getOperand(1), L, OriginalPred); } ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromICmp( const Loop *L, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, bool ControlsOnlyExit, bool AllowPredicates) { // Try to evaluate any dependencies out of the loop. LHS = getSCEVAtScope(LHS, L); RHS = getSCEVAtScope(RHS, L); // At this point, we would like to compute how many iterations of the // loop the predicate will return true for these inputs. if (isLoopInvariant(LHS, L) && !isLoopInvariant(RHS, L)) { // If there is a loop-invariant, force it into the RHS. std::swap(LHS, RHS); Pred = ICmpInst::getSwappedPredicate(Pred); } bool ControllingFiniteLoop = ControlsOnlyExit && loopHasNoAbnormalExits(L) && loopIsFiniteByAssumption(L); // Simplify the operands before analyzing them. (void)SimplifyICmpOperands(Pred, LHS, RHS, /*Depth=*/0); // If we have a comparison of a chrec against a constant, try to use value // ranges to answer this query. 
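      // For example, with a continue-predicate of `ult 10` and the addrec
      // {0,+,1}<L>, makeExactICmpRegion(ULT, 10) yields the range [0, 10);
      // the addrec stays inside it for iterations 0..9, so
      // getNumIterationsInRange returns 10.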
if (const SCEVConstant *RHSC = dyn_cast(RHS)) if (const SCEVAddRecExpr *AddRec = dyn_cast(LHS)) if (AddRec->getLoop() == L) { // Form the constant range. ConstantRange CompRange = ConstantRange::makeExactICmpRegion(Pred, RHSC->getAPInt()); const SCEV *Ret = AddRec->getNumIterationsInRange(CompRange, *this); if (!isa(Ret)) return Ret; } // If this loop must exit based on this condition (or execute undefined // behaviour), and we can prove the test sequence produced must repeat // the same values on self-wrap of the IV, then we can infer that IV // doesn't self wrap because if it did, we'd have an infinite (undefined) // loop. if (ControllingFiniteLoop && isLoopInvariant(RHS, L)) { // TODO: We can peel off any functions which are invertible *in L*. Loop // invariant terms are effectively constants for our purposes here. auto *InnerLHS = LHS; if (auto *ZExt = dyn_cast(LHS)) InnerLHS = ZExt->getOperand(); if (const SCEVAddRecExpr *AR = dyn_cast(InnerLHS)) { auto *StrideC = dyn_cast(AR->getStepRecurrence(*this)); if (!AR->hasNoSelfWrap() && AR->getLoop() == L && AR->isAffine() && StrideC && StrideC->getAPInt().isPowerOf2()) { auto Flags = AR->getNoWrapFlags(); Flags = setFlags(Flags, SCEV::FlagNW); SmallVector Operands{AR->operands()}; Flags = StrengthenNoWrapFlags(this, scAddRecExpr, Operands, Flags); setNoWrapFlags(const_cast(AR), Flags); } } } switch (Pred) { case ICmpInst::ICMP_NE: { // while (X != Y) // Convert to: while (X-Y != 0) if (LHS->getType()->isPointerTy()) { LHS = getLosslessPtrToIntExpr(LHS); if (isa(LHS)) return LHS; } if (RHS->getType()->isPointerTy()) { RHS = getLosslessPtrToIntExpr(RHS); if (isa(RHS)) return RHS; } ExitLimit EL = howFarToZero(getMinusSCEV(LHS, RHS), L, ControlsOnlyExit, AllowPredicates); if (EL.hasAnyInfo()) return EL; break; } case ICmpInst::ICMP_EQ: { // while (X == Y) // Convert to: while (X-Y == 0) if (LHS->getType()->isPointerTy()) { LHS = getLosslessPtrToIntExpr(LHS); if (isa(LHS)) return LHS; } if (RHS->getType()->isPointerTy()) { RHS = getLosslessPtrToIntExpr(RHS); if (isa(RHS)) return RHS; } ExitLimit EL = howFarToNonZero(getMinusSCEV(LHS, RHS), L); if (EL.hasAnyInfo()) return EL; break; } case ICmpInst::ICMP_SLE: case ICmpInst::ICMP_ULE: // Since the loop is finite, an invariant RHS cannot include the boundary // value, otherwise it would loop forever. if (!EnableFiniteLoopControl || !ControllingFiniteLoop || !isLoopInvariant(RHS, L)) break; RHS = getAddExpr(getOne(RHS->getType()), RHS); [[fallthrough]]; case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_ULT: { // while (X < Y) bool IsSigned = ICmpInst::isSigned(Pred); ExitLimit EL = howManyLessThans(LHS, RHS, L, IsSigned, ControlsOnlyExit, AllowPredicates); if (EL.hasAnyInfo()) return EL; break; } case ICmpInst::ICMP_SGE: case ICmpInst::ICMP_UGE: // Since the loop is finite, an invariant RHS cannot include the boundary // value, otherwise it would loop forever. 
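    // E.g. for `while (i >= n)` with loop-invariant n, a finite loop implies
    // that n cannot be SIGNED_MIN (for UGE, cannot be 0), so rewriting the
    // bound to n - 1 below and falling through to the strict comparison
    // cannot wrap.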
if (!EnableFiniteLoopControl || !ControllingFiniteLoop || !isLoopInvariant(RHS, L)) break; RHS = getAddExpr(getMinusOne(RHS->getType()), RHS); [[fallthrough]]; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_UGT: { // while (X > Y) bool IsSigned = ICmpInst::isSigned(Pred); ExitLimit EL = howManyGreaterThans(LHS, RHS, L, IsSigned, ControlsOnlyExit, AllowPredicates); if (EL.hasAnyInfo()) return EL; break; } default: break; } return getCouldNotCompute(); } ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromSingleExitSwitch(const Loop *L, SwitchInst *Switch, BasicBlock *ExitingBlock, bool ControlsOnlyExit) { assert(!L->contains(ExitingBlock) && "Not an exiting block!"); // Give up if the exit is the default dest of a switch. if (Switch->getDefaultDest() == ExitingBlock) return getCouldNotCompute(); assert(L->contains(Switch->getDefaultDest()) && "Default case must not exit the loop!"); const SCEV *LHS = getSCEVAtScope(Switch->getCondition(), L); const SCEV *RHS = getConstant(Switch->findCaseDest(ExitingBlock)); // while (X != Y) --> while (X-Y != 0) ExitLimit EL = howFarToZero(getMinusSCEV(LHS, RHS), L, ControlsOnlyExit); if (EL.hasAnyInfo()) return EL; return getCouldNotCompute(); } static ConstantInt * EvaluateConstantChrecAtConstant(const SCEVAddRecExpr *AddRec, ConstantInt *C, ScalarEvolution &SE) { const SCEV *InVal = SE.getConstant(C); const SCEV *Val = AddRec->evaluateAtIteration(InVal, SE); assert(isa(Val) && "Evaluation of SCEV at constant didn't fold correctly?"); return cast(Val)->getValue(); } ScalarEvolution::ExitLimit ScalarEvolution::computeShiftCompareExitLimit( Value *LHS, Value *RHSV, const Loop *L, ICmpInst::Predicate Pred) { ConstantInt *RHS = dyn_cast(RHSV); if (!RHS) return getCouldNotCompute(); const BasicBlock *Latch = L->getLoopLatch(); if (!Latch) return getCouldNotCompute(); const BasicBlock *Predecessor = L->getLoopPredecessor(); if (!Predecessor) return getCouldNotCompute(); // Return true if V is of the form "LHS `shift_op` ". // Return LHS in OutLHS and shift_opt in OutOpCode. auto MatchPositiveShift = [](Value *V, Value *&OutLHS, Instruction::BinaryOps &OutOpCode) { using namespace PatternMatch; ConstantInt *ShiftAmt; if (match(V, m_LShr(m_Value(OutLHS), m_ConstantInt(ShiftAmt)))) OutOpCode = Instruction::LShr; else if (match(V, m_AShr(m_Value(OutLHS), m_ConstantInt(ShiftAmt)))) OutOpCode = Instruction::AShr; else if (match(V, m_Shl(m_Value(OutLHS), m_ConstantInt(ShiftAmt)))) OutOpCode = Instruction::Shl; else return false; return ShiftAmt->getValue().isStrictlyPositive(); }; // Recognize a "shift recurrence" either of the form %iv or of %iv.shifted in // // loop: // %iv = phi i32 [ %iv.shifted, %loop ], [ %val, %preheader ] // %iv.shifted = lshr i32 %iv, // // Return true on a successful match. Return the corresponding PHI node (%iv // above) in PNOut and the opcode of the shift operation in OpCodeOut. auto MatchShiftRecurrence = [&](Value *V, PHINode *&PNOut, Instruction::BinaryOps &OpCodeOut) { std::optional PostShiftOpCode; { Instruction::BinaryOps OpC; Value *V; // If we encounter a shift instruction, "peel off" the shift operation, // and remember that we did so. Later when we inspect %iv's backedge // value, we will make sure that the backedge value uses the same // operation. // // Note: the peeled shift operation does not have to be the same // instruction as the one feeding into the PHI's backedge value. 
We only // really care about it being the same *kind* of shift instruction -- // that's all that is required for our later inferences to hold. if (MatchPositiveShift(LHS, V, OpC)) { PostShiftOpCode = OpC; LHS = V; } } PNOut = dyn_cast(LHS); if (!PNOut || PNOut->getParent() != L->getHeader()) return false; Value *BEValue = PNOut->getIncomingValueForBlock(Latch); Value *OpLHS; return // The backedge value for the PHI node must be a shift by a positive // amount MatchPositiveShift(BEValue, OpLHS, OpCodeOut) && // of the PHI node itself OpLHS == PNOut && // and the kind of shift should be match the kind of shift we peeled // off, if any. (!PostShiftOpCode || *PostShiftOpCode == OpCodeOut); }; PHINode *PN; Instruction::BinaryOps OpCode; if (!MatchShiftRecurrence(LHS, PN, OpCode)) return getCouldNotCompute(); const DataLayout &DL = getDataLayout(); // The key rationale for this optimization is that for some kinds of shift // recurrences, the value of the recurrence "stabilizes" to either 0 or -1 // within a finite number of iterations. If the condition guarding the // backedge (in the sense that the backedge is taken if the condition is true) // is false for the value the shift recurrence stabilizes to, then we know // that the backedge is taken only a finite number of times. ConstantInt *StableValue = nullptr; switch (OpCode) { default: llvm_unreachable("Impossible case!"); case Instruction::AShr: { // {K,ashr,} stabilizes to signum(K) in at most // bitwidth(K) iterations. Value *FirstValue = PN->getIncomingValueForBlock(Predecessor); KnownBits Known = computeKnownBits(FirstValue, DL, 0, &AC, Predecessor->getTerminator(), &DT); auto *Ty = cast(RHS->getType()); if (Known.isNonNegative()) StableValue = ConstantInt::get(Ty, 0); else if (Known.isNegative()) StableValue = ConstantInt::get(Ty, -1, true); else return getCouldNotCompute(); break; } case Instruction::LShr: case Instruction::Shl: // Both {K,lshr,} and {K,shl,} // stabilize to 0 in at most bitwidth(K) iterations. StableValue = ConstantInt::get(cast(RHS->getType()), 0); break; } auto *Result = ConstantFoldCompareInstOperands(Pred, StableValue, RHS, DL, &TLI); assert(Result->getType()->isIntegerTy(1) && "Otherwise cannot be an operand to a branch instruction"); if (Result->isZeroValue()) { unsigned BitWidth = getTypeSizeInBits(RHS->getType()); const SCEV *UpperBound = getConstant(getEffectiveSCEVType(RHS->getType()), BitWidth); return ExitLimit(getCouldNotCompute(), UpperBound, UpperBound, false); } return getCouldNotCompute(); } /// Return true if we can constant fold an instruction of the specified type, /// assuming that all operands were constants. static bool CanConstantFold(const Instruction *I) { if (isa(I) || isa(I) || isa(I) || isa(I) || isa(I) || isa(I) || isa(I)) return true; if (const CallInst *CI = dyn_cast(I)) if (const Function *F = CI->getCalledFunction()) return canConstantFoldCallTo(CI, F); return false; } /// Determine whether this instruction can constant evolve within this loop /// assuming its operands can all constant evolve. static bool canConstantEvolve(Instruction *I, const Loop *L) { // An instruction outside of the loop can't be derived from a loop PHI. if (!L->contains(I)) return false; if (isa(I)) { // We don't currently keep track of the control flow needed to evaluate // PHIs, so we cannot handle PHIs inside of loops. return L->getHeader() == I->getParent(); } // If we won't be able to constant fold this expression even if the operands // are constants, bail early. 
  return CanConstantFold(I);
}

/// getConstantEvolvingPHIOperands - Implement getConstantEvolvingPHI by
/// recursing through each instruction operand until reaching a loop header
/// phi.
static PHINode *
getConstantEvolvingPHIOperands(Instruction *UseInst, const Loop *L,
                               DenseMap<Instruction *, PHINode *> &PHIMap,
                               unsigned Depth) {
  if (Depth > MaxConstantEvolvingDepth)
    return nullptr;

  // Otherwise, we can evaluate this instruction if all of its operands are
  // constant or derived from a PHI node themselves.
  PHINode *PHI = nullptr;
  for (Value *Op : UseInst->operands()) {
    if (isa<Constant>(Op))
      continue;

    Instruction *OpInst = dyn_cast<Instruction>(Op);
    if (!OpInst || !canConstantEvolve(OpInst, L))
      return nullptr;

    PHINode *P = dyn_cast<PHINode>(OpInst);
    if (!P)
      // If this operand is already visited, reuse the prior result.
      // We may have P != PHI if this is the deepest point at which the
      // inconsistent paths meet.
      P = PHIMap.lookup(OpInst);
    if (!P) {
      // Recurse and memoize the results, whether a phi is found or not.
      // This recursive call invalidates pointers into PHIMap.
      P = getConstantEvolvingPHIOperands(OpInst, L, PHIMap, Depth + 1);
      PHIMap[OpInst] = P;
    }
    if (!P)
      return nullptr; // Not evolving from PHI
    if (PHI && PHI != P)
      return nullptr; // Evolving from multiple different PHIs.
    PHI = P;
  }
  // This is an expression evolving from a constant PHI!
  return PHI;
}

/// getConstantEvolvingPHI - Given an LLVM value and a loop, return a PHI node
/// in the loop that V is derived from. We allow arbitrary operations along
/// the way, but the operands of an operation must either be constants or a
/// value derived from a constant PHI. If this expression does not fit with
/// these constraints, return null.
static PHINode *getConstantEvolvingPHI(Value *V, const Loop *L) {
  Instruction *I = dyn_cast<Instruction>(V);
  if (!I || !canConstantEvolve(I, L))
    return nullptr;

  if (PHINode *PN = dyn_cast<PHINode>(I))
    return PN;

  // Record non-constant instructions contained by the loop.
  DenseMap<Instruction *, PHINode *> PHIMap;
  return getConstantEvolvingPHIOperands(I, L, PHIMap, 0);
}

/// EvaluateExpression - Given an expression that passes the
/// getConstantEvolvingPHI predicate, evaluate its value assuming the PHI node
/// in the loop has the value PHIVal. If we can't fold this expression for
/// some reason, return null.
static Constant *EvaluateExpression(Value *V, const Loop *L,
                                    DenseMap<Instruction *, Constant *> &Vals,
                                    const DataLayout &DL,
                                    const TargetLibraryInfo *TLI) {
  // Convenient constant check, but redundant for recursive calls.
  if (Constant *C = dyn_cast<Constant>(V))
    return C;
  Instruction *I = dyn_cast<Instruction>(V);
  if (!I)
    return nullptr;

  if (Constant *C = Vals.lookup(I))
    return C;

  // An instruction inside the loop depends on a value outside the loop that we
  // weren't given a mapping for, or a value such as a call inside the loop.
  if (!canConstantEvolve(I, L))
    return nullptr;

  // An unmapped PHI can be due to a branch or another loop inside this loop,
  // or due to this not being the initial iteration through a loop where we
  // couldn't compute the evolution of this particular PHI last time.
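  // Only unmapped PHIs reach the check below (mapped ones were returned via
  // Vals.lookup above). Non-PHI instructions are folded operand by operand;
  // e.g. with Vals = { %x -> i32 5 }, an expression `%y = mul i32 %x, 3`
  // evaluates to i32 15 through ConstantFoldInstOperands at the end of this
  // function.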
if (isa(I)) return nullptr; std::vector Operands(I->getNumOperands()); for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { Instruction *Operand = dyn_cast(I->getOperand(i)); if (!Operand) { Operands[i] = dyn_cast(I->getOperand(i)); if (!Operands[i]) return nullptr; continue; } Constant *C = EvaluateExpression(Operand, L, Vals, DL, TLI); Vals[Operand] = C; if (!C) return nullptr; Operands[i] = C; } return ConstantFoldInstOperands(I, Operands, DL, TLI); } // If every incoming value to PN except the one for BB is a specific Constant, // return that, else return nullptr. static Constant *getOtherIncomingValue(PHINode *PN, BasicBlock *BB) { Constant *IncomingVal = nullptr; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { if (PN->getIncomingBlock(i) == BB) continue; auto *CurrentVal = dyn_cast(PN->getIncomingValue(i)); if (!CurrentVal) return nullptr; if (IncomingVal != CurrentVal) { if (IncomingVal) return nullptr; IncomingVal = CurrentVal; } } return IncomingVal; } /// getConstantEvolutionLoopExitValue - If we know that the specified Phi is /// in the header of its containing loop, we know the loop executes a /// constant number of times, and the PHI node is just a recurrence /// involving constants, fold it. Constant * ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, const APInt &BEs, const Loop *L) { auto I = ConstantEvolutionLoopExitValue.find(PN); if (I != ConstantEvolutionLoopExitValue.end()) return I->second; if (BEs.ugt(MaxBruteForceIterations)) return ConstantEvolutionLoopExitValue[PN] = nullptr; // Not going to evaluate it. Constant *&RetVal = ConstantEvolutionLoopExitValue[PN]; DenseMap CurrentIterVals; BasicBlock *Header = L->getHeader(); assert(PN->getParent() == Header && "Can't evaluate PHI not in loop header!"); BasicBlock *Latch = L->getLoopLatch(); if (!Latch) return nullptr; for (PHINode &PHI : Header->phis()) { if (auto *StartCST = getOtherIncomingValue(&PHI, Latch)) CurrentIterVals[&PHI] = StartCST; } if (!CurrentIterVals.count(PN)) return RetVal = nullptr; Value *BEValue = PN->getIncomingValueForBlock(Latch); // Execute the loop symbolically to determine the exit value. assert(BEs.getActiveBits() < CHAR_BIT * sizeof(unsigned) && "BEs is <= MaxBruteForceIterations which is an 'unsigned'!"); unsigned NumIterations = BEs.getZExtValue(); // must be in range unsigned IterationNum = 0; const DataLayout &DL = getDataLayout(); for (; ; ++IterationNum) { if (IterationNum == NumIterations) return RetVal = CurrentIterVals[PN]; // Got exit value! // Compute the value of the PHIs for the next iteration. // EvaluateExpression adds non-phi values to the CurrentIterVals map. DenseMap NextIterVals; Constant *NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL, &TLI); if (!NextPHI) return nullptr; // Couldn't evaluate! NextIterVals[PN] = NextPHI; bool StoppedEvolving = NextPHI == CurrentIterVals[PN]; // Also evaluate the other PHI nodes. However, we don't get to stop if we // cease to be able to evaluate one of them or if they stop evolving, // because that doesn't necessarily prevent us from computing PN. SmallVector, 8> PHIsToCompute; for (const auto &I : CurrentIterVals) { PHINode *PHI = dyn_cast(I.first); if (!PHI || PHI == PN || PHI->getParent() != Header) continue; PHIsToCompute.emplace_back(PHI, I.second); } // We use two distinct loops because EvaluateExpression may invalidate any // iterators into CurrentIterVals. 
for (const auto &I : PHIsToCompute) { PHINode *PHI = I.first; Constant *&NextPHI = NextIterVals[PHI]; if (!NextPHI) { // Not already computed. Value *BEValue = PHI->getIncomingValueForBlock(Latch); NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL, &TLI); } if (NextPHI != I.second) StoppedEvolving = false; } // If all entries in CurrentIterVals == NextIterVals then we can stop // iterating, the loop can't continue to change. if (StoppedEvolving) return RetVal = CurrentIterVals[PN]; CurrentIterVals.swap(NextIterVals); } } const SCEV *ScalarEvolution::computeExitCountExhaustively(const Loop *L, Value *Cond, bool ExitWhen) { PHINode *PN = getConstantEvolvingPHI(Cond, L); if (!PN) return getCouldNotCompute(); // If the loop is canonicalized, the PHI will have exactly two entries. // That's the only form we support here. if (PN->getNumIncomingValues() != 2) return getCouldNotCompute(); DenseMap CurrentIterVals; BasicBlock *Header = L->getHeader(); assert(PN->getParent() == Header && "Can't evaluate PHI not in loop header!"); BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Should follow from NumIncomingValues == 2!"); for (PHINode &PHI : Header->phis()) { if (auto *StartCST = getOtherIncomingValue(&PHI, Latch)) CurrentIterVals[&PHI] = StartCST; } if (!CurrentIterVals.count(PN)) return getCouldNotCompute(); // Okay, we find a PHI node that defines the trip count of this loop. Execute // the loop symbolically to determine when the condition gets a value of // "ExitWhen". unsigned MaxIterations = MaxBruteForceIterations; // Limit analysis. const DataLayout &DL = getDataLayout(); for (unsigned IterationNum = 0; IterationNum != MaxIterations;++IterationNum){ auto *CondVal = dyn_cast_or_null( EvaluateExpression(Cond, L, CurrentIterVals, DL, &TLI)); // Couldn't symbolically evaluate. if (!CondVal) return getCouldNotCompute(); if (CondVal->getValue() == uint64_t(ExitWhen)) { ++NumBruteForceTripCountsComputed; return getConstant(Type::getInt32Ty(getContext()), IterationNum); } // Update all the PHI nodes for the next iteration. DenseMap NextIterVals; // Create a list of which PHIs we need to compute. We want to do this before // calling EvaluateExpression on them because that may invalidate iterators // into CurrentIterVals. SmallVector PHIsToCompute; for (const auto &I : CurrentIterVals) { PHINode *PHI = dyn_cast(I.first); if (!PHI || PHI->getParent() != Header) continue; PHIsToCompute.push_back(PHI); } for (PHINode *PHI : PHIsToCompute) { Constant *&NextPHI = NextIterVals[PHI]; if (NextPHI) continue; // Already computed! Value *BEValue = PHI->getIncomingValueForBlock(Latch); NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL, &TLI); } CurrentIterVals.swap(NextIterVals); } // Too many iterations were needed to evaluate. return getCouldNotCompute(); } const SCEV *ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) { SmallVector, 2> &Values = ValuesAtScopes[V]; // Check to see if we've folded this expression at this loop before. for (auto &LS : Values) if (LS.first == L) return LS.second ? LS.second : V; Values.emplace_back(L, nullptr); // Otherwise compute it. const SCEV *C = computeSCEVAtScope(V, L); for (auto &LS : reverse(ValuesAtScopes[V])) if (LS.first == L) { LS.second = C; if (!isa(C)) ValuesAtScopesUsers[C].push_back({L, V}); break; } return C; } /// This builds up a Constant using the ConstantExpr interface. That way, we /// will return Constants for objects which aren't represented by a /// SCEVConstant, because SCEVConstant is restricted to ConstantInt. 
/// Returns NULL if the SCEV isn't representable as a Constant.
static Constant *BuildConstantFromSCEV(const SCEV *V) {
  switch (V->getSCEVType()) {
  case scCouldNotCompute:
  case scAddRecExpr:
  case scVScale:
    return nullptr;
  case scConstant:
    return cast<SCEVConstant>(V)->getValue();
  case scUnknown:
    return dyn_cast<Constant>(cast<SCEVUnknown>(V)->getValue());
  case scSignExtend: {
    const SCEVSignExtendExpr *SS = cast<SCEVSignExtendExpr>(V);
    if (Constant *CastOp = BuildConstantFromSCEV(SS->getOperand()))
      return ConstantExpr::getSExt(CastOp, SS->getType());
    return nullptr;
  }
  case scZeroExtend: {
    const SCEVZeroExtendExpr *SZ = cast<SCEVZeroExtendExpr>(V);
    if (Constant *CastOp = BuildConstantFromSCEV(SZ->getOperand()))
      return ConstantExpr::getZExt(CastOp, SZ->getType());
    return nullptr;
  }
  case scPtrToInt: {
    const SCEVPtrToIntExpr *P2I = cast<SCEVPtrToIntExpr>(V);
    if (Constant *CastOp = BuildConstantFromSCEV(P2I->getOperand()))
      return ConstantExpr::getPtrToInt(CastOp, P2I->getType());
    return nullptr;
  }
  case scTruncate: {
    const SCEVTruncateExpr *ST = cast<SCEVTruncateExpr>(V);
    if (Constant *CastOp = BuildConstantFromSCEV(ST->getOperand()))
      return ConstantExpr::getTrunc(CastOp, ST->getType());
    return nullptr;
  }
  case scAddExpr: {
    const SCEVAddExpr *SA = cast<SCEVAddExpr>(V);
    Constant *C = nullptr;
    for (const SCEV *Op : SA->operands()) {
      Constant *OpC = BuildConstantFromSCEV(Op);
      if (!OpC)
        return nullptr;
      if (!C) {
        C = OpC;
        continue;
      }
      assert(!C->getType()->isPointerTy() &&
             "Can only have one pointer, and it must be last");
      if (auto *PT = dyn_cast<PointerType>(OpC->getType())) {
        // The offsets have been converted to bytes. We can add bytes to an
        // i8* by GEP with the byte count in the first index.
        Type *DestPtrTy =
            Type::getInt8PtrTy(PT->getContext(), PT->getAddressSpace());
        OpC = ConstantExpr::getBitCast(OpC, DestPtrTy);
        C = ConstantExpr::getGetElementPtr(Type::getInt8Ty(C->getContext()),
                                           OpC, C);
      } else {
        C = ConstantExpr::getAdd(C, OpC);
      }
    }
    return C;
  }
  case scMulExpr: {
    const SCEVMulExpr *SM = cast<SCEVMulExpr>(V);
    Constant *C = nullptr;
    for (const SCEV *Op : SM->operands()) {
      assert(!Op->getType()->isPointerTy() && "Can't multiply pointers");
      Constant *OpC = BuildConstantFromSCEV(Op);
      if (!OpC)
        return nullptr;
      C = C ? ConstantExpr::getMul(C, OpC) : OpC;
    }
    return C;
  }
  case scUDivExpr:
  case scSMaxExpr:
  case scUMaxExpr:
  case scSMinExpr:
  case scUMinExpr:
  case scSequentialUMinExpr:
    return nullptr; // TODO: smax, umax, smin, umin, umin_seq.
} llvm_unreachable("Unknown SCEV kind!"); } const SCEV * ScalarEvolution::getWithOperands(const SCEV *S, SmallVectorImpl &NewOps) { switch (S->getSCEVType()) { case scTruncate: case scZeroExtend: case scSignExtend: case scPtrToInt: return getCastExpr(S->getSCEVType(), NewOps[0], S->getType()); case scAddRecExpr: { auto *AddRec = cast(S); return getAddRecExpr(NewOps, AddRec->getLoop(), AddRec->getNoWrapFlags()); } case scAddExpr: return getAddExpr(NewOps, cast(S)->getNoWrapFlags()); case scMulExpr: return getMulExpr(NewOps, cast(S)->getNoWrapFlags()); case scUDivExpr: return getUDivExpr(NewOps[0], NewOps[1]); case scUMaxExpr: case scSMaxExpr: case scUMinExpr: case scSMinExpr: return getMinMaxExpr(S->getSCEVType(), NewOps); case scSequentialUMinExpr: return getSequentialMinMaxExpr(S->getSCEVType(), NewOps); case scConstant: case scVScale: case scUnknown: return S; case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); } llvm_unreachable("Unknown SCEV kind!"); } const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { switch (V->getSCEVType()) { case scConstant: case scVScale: return V; case scAddRecExpr: { // If this is a loop recurrence for a loop that does not contain L, then we // are dealing with the final value computed by the loop. const SCEVAddRecExpr *AddRec = cast(V); // First, attempt to evaluate each operand. // Avoid performing the look-up in the common case where the specified // expression has no loop-variant portions. for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) { const SCEV *OpAtScope = getSCEVAtScope(AddRec->getOperand(i), L); if (OpAtScope == AddRec->getOperand(i)) continue; // Okay, at least one of these operands is loop variant but might be // foldable. Build a new instance of the folded commutative expression. SmallVector NewOps; NewOps.reserve(AddRec->getNumOperands()); append_range(NewOps, AddRec->operands().take_front(i)); NewOps.push_back(OpAtScope); for (++i; i != e; ++i) NewOps.push_back(getSCEVAtScope(AddRec->getOperand(i), L)); const SCEV *FoldedRec = getAddRecExpr( NewOps, AddRec->getLoop(), AddRec->getNoWrapFlags(SCEV::FlagNW)); AddRec = dyn_cast(FoldedRec); // The addrec may be folded to a nonrecurrence, for example, if the // induction variable is multiplied by zero after constant folding. Go // ahead and return the folded value. if (!AddRec) return FoldedRec; break; } // If the scope is outside the addrec's loop, evaluate it by using the // loop exit value of the addrec. if (!AddRec->getLoop()->contains(L)) { // To evaluate this recurrence, we need to know how many times the AddRec // loop iterates. Compute this now. const SCEV *BackedgeTakenCount = getBackedgeTakenCount(AddRec->getLoop()); if (BackedgeTakenCount == getCouldNotCompute()) return AddRec; // Then, evaluate the AddRec. return AddRec->evaluateAtIteration(BackedgeTakenCount, *this); } return AddRec; } case scTruncate: case scZeroExtend: case scSignExtend: case scPtrToInt: case scAddExpr: case scMulExpr: case scUDivExpr: case scUMaxExpr: case scSMaxExpr: case scUMinExpr: case scSMinExpr: case scSequentialUMinExpr: { ArrayRef Ops = V->operands(); // Avoid performing the look-up in the common case where the specified // expression has no loop-variant portions. for (unsigned i = 0, e = Ops.size(); i != e; ++i) { const SCEV *OpAtScope = getSCEVAtScope(Ops[i], L); if (OpAtScope != Ops[i]) { // Okay, at least one of these operands is loop variant but might be // foldable. 
Build a new instance of the folded commutative expression. SmallVector NewOps; NewOps.reserve(Ops.size()); append_range(NewOps, Ops.take_front(i)); NewOps.push_back(OpAtScope); for (++i; i != e; ++i) { OpAtScope = getSCEVAtScope(Ops[i], L); NewOps.push_back(OpAtScope); } return getWithOperands(V, NewOps); } } // If we got here, all operands are loop invariant. return V; } case scUnknown: { // If this instruction is evolved from a constant-evolving PHI, compute the // exit value from the loop without using SCEVs. const SCEVUnknown *SU = cast(V); Instruction *I = dyn_cast(SU->getValue()); if (!I) return V; // This is some other type of SCEVUnknown, just return it. if (PHINode *PN = dyn_cast(I)) { const Loop *CurrLoop = this->LI[I->getParent()]; // Looking for loop exit value. if (CurrLoop && CurrLoop->getParentLoop() == L && PN->getParent() == CurrLoop->getHeader()) { // Okay, there is no closed form solution for the PHI node. Check // to see if the loop that contains it has a known backedge-taken // count. If so, we may be able to force computation of the exit // value. const SCEV *BackedgeTakenCount = getBackedgeTakenCount(CurrLoop); // This trivial case can show up in some degenerate cases where // the incoming IR has not yet been fully simplified. if (BackedgeTakenCount->isZero()) { Value *InitValue = nullptr; bool MultipleInitValues = false; for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) { if (!CurrLoop->contains(PN->getIncomingBlock(i))) { if (!InitValue) InitValue = PN->getIncomingValue(i); else if (InitValue != PN->getIncomingValue(i)) { MultipleInitValues = true; break; } } } if (!MultipleInitValues && InitValue) return getSCEV(InitValue); } // Do we have a loop invariant value flowing around the backedge // for a loop which must execute the backedge? if (!isa(BackedgeTakenCount) && isKnownPositive(BackedgeTakenCount) && PN->getNumIncomingValues() == 2) { unsigned InLoopPred = CurrLoop->contains(PN->getIncomingBlock(0)) ? 0 : 1; Value *BackedgeVal = PN->getIncomingValue(InLoopPred); if (CurrLoop->isLoopInvariant(BackedgeVal)) return getSCEV(BackedgeVal); } if (auto *BTCC = dyn_cast(BackedgeTakenCount)) { // Okay, we know how many times the containing loop executes. If // this is a constant evolving PHI node, get the final value at // the specified iteration number. Constant *RV = getConstantEvolutionLoopExitValue(PN, BTCC->getAPInt(), CurrLoop); if (RV) return getSCEV(RV); } } } // Okay, this is an expression that we cannot symbolically evaluate // into a SCEV. Check to see if it's possible to symbolically evaluate // the arguments into constants, and if so, try to constant propagate the // result. This is particularly useful for computing loop exit values. if (!CanConstantFold(I)) return V; // This is some other type of SCEVUnknown, just return it. SmallVector Operands; Operands.reserve(I->getNumOperands()); bool MadeImprovement = false; for (Value *Op : I->operands()) { if (Constant *C = dyn_cast(Op)) { Operands.push_back(C); continue; } // If any of the operands is non-constant and if they are // non-integer and non-pointer, don't even try to analyze them // with scev techniques. 
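      // (For example, a floating-point operand has no SCEV representation, so
      // isSCEVable fails below and the instruction is left unfolded by
      // returning V unchanged.)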
if (!isSCEVable(Op->getType())) return V; const SCEV *OrigV = getSCEV(Op); const SCEV *OpV = getSCEVAtScope(OrigV, L); MadeImprovement |= OrigV != OpV; Constant *C = BuildConstantFromSCEV(OpV); if (!C) return V; if (C->getType() != Op->getType()) C = ConstantExpr::getCast( CastInst::getCastOpcode(C, false, Op->getType(), false), C, Op->getType()); Operands.push_back(C); } // Check to see if getSCEVAtScope actually made an improvement. if (!MadeImprovement) return V; // This is some other type of SCEVUnknown, just return it. Constant *C = nullptr; const DataLayout &DL = getDataLayout(); C = ConstantFoldInstOperands(I, Operands, DL, &TLI); if (!C) return V; return getSCEV(C); } case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); } llvm_unreachable("Unknown SCEV type!"); } const SCEV *ScalarEvolution::getSCEVAtScope(Value *V, const Loop *L) { return getSCEVAtScope(getSCEV(V), L); } const SCEV *ScalarEvolution::stripInjectiveFunctions(const SCEV *S) const { if (const SCEVZeroExtendExpr *ZExt = dyn_cast(S)) return stripInjectiveFunctions(ZExt->getOperand()); if (const SCEVSignExtendExpr *SExt = dyn_cast(S)) return stripInjectiveFunctions(SExt->getOperand()); return S; } /// Finds the minimum unsigned root of the following equation: /// /// A * X = B (mod N) /// /// where N = 2^BW and BW is the common bit width of A and B. The signedness of /// A and B isn't important. /// /// If the equation does not have a solution, SCEVCouldNotCompute is returned. static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const SCEV *B, ScalarEvolution &SE) { uint32_t BW = A.getBitWidth(); assert(BW == SE.getTypeSizeInBits(B->getType())); assert(A != 0 && "A must be non-zero."); // 1. D = gcd(A, N) // // The gcd of A and N may have only one prime factor: 2. The number of // trailing zeros in A is its multiplicity uint32_t Mult2 = A.countr_zero(); // D = 2^Mult2 // 2. Check if B is divisible by D. // // B is divisible by D if and only if the multiplicity of prime factor 2 for B // is not less than multiplicity of this prime factor for D. if (SE.getMinTrailingZeros(B) < Mult2) return SE.getCouldNotCompute(); // 3. Compute I: the multiplicative inverse of (A / D) in arithmetic // modulo (N / D). // // If D == 1, (N / D) == N == 2^BW, so we need one extra bit to represent // (N / D) in general. The inverse itself always fits into BW bits, though, // so we immediately truncate it. APInt AD = A.lshr(Mult2).zext(BW + 1); // AD = A / D APInt Mod(BW + 1, 0); Mod.setBit(BW - Mult2); // Mod = N / D APInt I = AD.multiplicativeInverse(Mod).trunc(BW); // 4. Compute the minimum unsigned root of the equation: // I * (B / D) mod (N / D) // To simplify the computation, we factor out the divide by D: // (I * B mod N) / D const SCEV *D = SE.getConstant(APInt::getOneBitSet(BW, Mult2)); return SE.getUDivExactExpr(SE.getMulExpr(B, SE.getConstant(I)), D); } /// For a given quadratic addrec, generate coefficients of the corresponding /// quadratic equation, multiplied by a common value to ensure that they are /// integers. /// The returned value is a tuple { A, B, C, M, BitWidth }, where /// Ax^2 + Bx + C is the quadratic function, M is the value that A, B and C /// were multiplied by, and BitWidth is the bit width of the original addrec /// coefficients. /// This function returns std::nullopt if the addrec coefficients are not /// compile- time constants. 
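/// For example, for the addrec {1,+,2,+,4} (L=1, M=2, N=4) the accumulated
/// value after n iterations is 1 + 2n + 4*n(n-1)/2; doubling it gives the
/// equation 4n^2 + 0*n + 2 = 0, so the returned tuple is
/// { A=4, B=2*2-4=0, C=2*1=2, 2, BitWidth } with 2 as the common multiplier.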
static std::optional> GetQuadraticEquation(const SCEVAddRecExpr *AddRec) { assert(AddRec->getNumOperands() == 3 && "This is not a quadratic chrec!"); const SCEVConstant *LC = dyn_cast(AddRec->getOperand(0)); const SCEVConstant *MC = dyn_cast(AddRec->getOperand(1)); const SCEVConstant *NC = dyn_cast(AddRec->getOperand(2)); LLVM_DEBUG(dbgs() << __func__ << ": analyzing quadratic addrec: " << *AddRec << '\n'); // We currently can only solve this if the coefficients are constants. if (!LC || !MC || !NC) { LLVM_DEBUG(dbgs() << __func__ << ": coefficients are not constant\n"); return std::nullopt; } APInt L = LC->getAPInt(); APInt M = MC->getAPInt(); APInt N = NC->getAPInt(); assert(!N.isZero() && "This is not a quadratic addrec"); unsigned BitWidth = LC->getAPInt().getBitWidth(); unsigned NewWidth = BitWidth + 1; LLVM_DEBUG(dbgs() << __func__ << ": addrec coeff bw: " << BitWidth << '\n'); // The sign-extension (as opposed to a zero-extension) here matches the // extension used in SolveQuadraticEquationWrap (with the same motivation). N = N.sext(NewWidth); M = M.sext(NewWidth); L = L.sext(NewWidth); // The increments are M, M+N, M+2N, ..., so the accumulated values are // L+M, (L+M)+(M+N), (L+M)+(M+N)+(M+2N), ..., that is, // L+M, L+2M+N, L+3M+3N, ... // After n iterations the accumulated value Acc is L + nM + n(n-1)/2 N. // // The equation Acc = 0 is then // L + nM + n(n-1)/2 N = 0, or 2L + 2M n + n(n-1) N = 0. // In a quadratic form it becomes: // N n^2 + (2M-N) n + 2L = 0. APInt A = N; APInt B = 2 * M - A; APInt C = 2 * L; APInt T = APInt(NewWidth, 2); LLVM_DEBUG(dbgs() << __func__ << ": equation " << A << "x^2 + " << B << "x + " << C << ", coeff bw: " << NewWidth << ", multiplied by " << T << '\n'); return std::make_tuple(A, B, C, T, BitWidth); } /// Helper function to compare optional APInts: /// (a) if X and Y both exist, return min(X, Y), /// (b) if neither X nor Y exist, return std::nullopt, /// (c) if exactly one of X and Y exists, return that value. static std::optional MinOptional(std::optional X, std::optional Y) { if (X && Y) { unsigned W = std::max(X->getBitWidth(), Y->getBitWidth()); APInt XW = X->sext(W); APInt YW = Y->sext(W); return XW.slt(YW) ? *X : *Y; } if (!X && !Y) return std::nullopt; return X ? *X : *Y; } /// Helper function to truncate an optional APInt to a given BitWidth. /// When solving addrec-related equations, it is preferable to return a value /// that has the same bit width as the original addrec's coefficients. If the /// solution fits in the original bit width, truncate it (except for i1). /// Returning a value of a different bit width may inhibit some optimizations. /// /// In general, a solution to a quadratic equation generated from an addrec /// may require BW+1 bits, where BW is the bit width of the addrec's /// coefficients. The reason is that the coefficients of the quadratic /// equation are BW+1 bits wide (to avoid truncation when converting from /// the addrec to the equation). static std::optional TruncIfPossible(std::optional X, unsigned BitWidth) { if (!X) return std::nullopt; unsigned W = X->getBitWidth(); if (BitWidth > 1 && BitWidth < W && X->isIntN(BitWidth)) return X->trunc(BitWidth); return X; } /// Let c(n) be the value of the quadratic chrec {L,+,M,+,N} after n /// iterations. The values L, M, N are assumed to be signed, and they /// should all have the same bit widths. /// Find the least n >= 0 such that c(n) = 0 in the arithmetic modulo 2^BW, /// where BW is the bit width of the addrec's coefficients. 
/// If the calculated value is a BW-bit integer (for BW > 1), it will be /// returned as such, otherwise the bit width of the returned value may /// be greater than BW. /// /// This function returns std::nullopt if /// (a) the addrec coefficients are not constant, or /// (b) SolveQuadraticEquationWrap was unable to find a solution. For cases /// like x^2 = 5, no integer solutions exist, in other cases an integer /// solution may exist, but SolveQuadraticEquationWrap may fail to find it. static std::optional SolveQuadraticAddRecExact(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) { APInt A, B, C, M; unsigned BitWidth; auto T = GetQuadraticEquation(AddRec); if (!T) return std::nullopt; std::tie(A, B, C, M, BitWidth) = *T; LLVM_DEBUG(dbgs() << __func__ << ": solving for unsigned overflow\n"); std::optional X = APIntOps::SolveQuadraticEquationWrap(A, B, C, BitWidth + 1); if (!X) return std::nullopt; ConstantInt *CX = ConstantInt::get(SE.getContext(), *X); ConstantInt *V = EvaluateConstantChrecAtConstant(AddRec, CX, SE); if (!V->isZero()) return std::nullopt; return TruncIfPossible(X, BitWidth); } /// Let c(n) be the value of the quadratic chrec {0,+,M,+,N} after n /// iterations. The values M, N are assumed to be signed, and they /// should all have the same bit widths. /// Find the least n such that c(n) does not belong to the given range, /// while c(n-1) does. /// /// This function returns std::nullopt if /// (a) the addrec coefficients are not constant, or /// (b) SolveQuadraticEquationWrap was unable to find a solution for the /// bounds of the range. static std::optional SolveQuadraticAddRecRange(const SCEVAddRecExpr *AddRec, const ConstantRange &Range, ScalarEvolution &SE) { assert(AddRec->getOperand(0)->isZero() && "Starting value of addrec should be 0"); LLVM_DEBUG(dbgs() << __func__ << ": solving boundary crossing for range " << Range << ", addrec " << *AddRec << '\n'); // This case is handled in getNumIterationsInRange. Here we can assume that // we start in the range. assert(Range.contains(APInt(SE.getTypeSizeInBits(AddRec->getType()), 0)) && "Addrec's initial value should be in range"); APInt A, B, C, M; unsigned BitWidth; auto T = GetQuadraticEquation(AddRec); if (!T) return std::nullopt; // Be careful about the return value: there can be two reasons for not // returning an actual number. First, if no solutions to the equations // were found, and second, if the solutions don't leave the given range. // The first case means that the actual solution is "unknown", the second // means that it's known, but not valid. If the solution is unknown, we // cannot make any conclusions. // Return a pair: the optional solution and a flag indicating if the // solution was found. auto SolveForBoundary = [&](APInt Bound) -> std::pair, bool> { // Solve for signed overflow and unsigned overflow, pick the lower // solution. LLVM_DEBUG(dbgs() << "SolveQuadraticAddRecRange: checking boundary " << Bound << " (before multiplying by " << M << ")\n"); Bound *= M; // The quadratic equation multiplier. 
std::optional SO; if (BitWidth > 1) { LLVM_DEBUG(dbgs() << "SolveQuadraticAddRecRange: solving for " "signed overflow\n"); SO = APIntOps::SolveQuadraticEquationWrap(A, B, -Bound, BitWidth); } LLVM_DEBUG(dbgs() << "SolveQuadraticAddRecRange: solving for " "unsigned overflow\n"); std::optional UO = APIntOps::SolveQuadraticEquationWrap(A, B, -Bound, BitWidth + 1); auto LeavesRange = [&] (const APInt &X) { ConstantInt *C0 = ConstantInt::get(SE.getContext(), X); ConstantInt *V0 = EvaluateConstantChrecAtConstant(AddRec, C0, SE); if (Range.contains(V0->getValue())) return false; // X should be at least 1, so X-1 is non-negative. ConstantInt *C1 = ConstantInt::get(SE.getContext(), X-1); ConstantInt *V1 = EvaluateConstantChrecAtConstant(AddRec, C1, SE); if (Range.contains(V1->getValue())) return true; return false; }; // If SolveQuadraticEquationWrap returns std::nullopt, it means that there // can be a solution, but the function failed to find it. We cannot treat it // as "no solution". if (!SO || !UO) return {std::nullopt, false}; // Check the smaller value first to see if it leaves the range. // At this point, both SO and UO must have values. std::optional Min = MinOptional(SO, UO); if (LeavesRange(*Min)) return { Min, true }; std::optional Max = Min == SO ? UO : SO; if (LeavesRange(*Max)) return { Max, true }; // Solutions were found, but were eliminated, hence the "true". return {std::nullopt, true}; }; std::tie(A, B, C, M, BitWidth) = *T; // Lower bound is inclusive, subtract 1 to represent the exiting value. APInt Lower = Range.getLower().sext(A.getBitWidth()) - 1; APInt Upper = Range.getUpper().sext(A.getBitWidth()); auto SL = SolveForBoundary(Lower); auto SU = SolveForBoundary(Upper); // If any of the solutions was unknown, no meaninigful conclusions can // be made. if (!SL.second || !SU.second) return std::nullopt; // Claim: The correct solution is not some value between Min and Max. // // Justification: Assuming that Min and Max are different values, one of // them is when the first signed overflow happens, the other is when the // first unsigned overflow happens. Crossing the range boundary is only // possible via an overflow (treating 0 as a special case of it, modeling // an overflow as crossing k*2^W for some k). // // The interesting case here is when Min was eliminated as an invalid // solution, but Max was not. The argument is that if there was another // overflow between Min and Max, it would also have been eliminated if // it was considered. // // For a given boundary, it is possible to have two overflows of the same // type (signed/unsigned) without having the other type in between: this // can happen when the vertex of the parabola is between the iterations // corresponding to the overflows. This is only possible when the two // overflows cross k*2^W for the same k. In such case, if the second one // left the range (and was the first one to do so), the first overflow // would have to enter the range, which would mean that either we had left // the range before or that we started outside of it. Both of these cases // are contradictions. // // Claim: In the case where SolveForBoundary returns std::nullopt, the correct // solution is not some value between the Max for this boundary and the // Min of the other boundary. // // Justification: Assume that we had such Max_A and Min_B corresponding // to range boundaries A and B and such that Max_A < Min_B. If there was // a solution between Max_A and Min_B, it would have to be caused by an // overflow corresponding to either A or B. 
It cannot correspond to B, // since Min_B is the first occurrence of such an overflow. If it // corresponded to A, it would have to be either a signed or an unsigned // overflow that is larger than both eliminated overflows for A. But // between the eliminated overflows and this overflow, the values would // cover the entire value space, thus crossing the other boundary, which // is a contradiction. return TruncIfPossible(MinOptional(SL.first, SU.first), BitWidth); } ScalarEvolution::ExitLimit ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsOnlyExit, bool AllowPredicates) { // This is only used for loops with a "x != y" exit test. The exit condition // is now expressed as a single expression, V = x-y. So the exit test is // effectively V != 0. We know and take advantage of the fact that this // expression only being used in a comparison by zero context. SmallPtrSet Predicates; // If the value is a constant if (const SCEVConstant *C = dyn_cast(V)) { // If the value is already zero, the branch will execute zero times. if (C->getValue()->isZero()) return C; return getCouldNotCompute(); // Otherwise it will loop infinitely. } const SCEVAddRecExpr *AddRec = dyn_cast(stripInjectiveFunctions(V)); if (!AddRec && AllowPredicates) // Try to make this an AddRec using runtime tests, in the first X // iterations of this loop, where X is the SCEV expression found by the // algorithm below. AddRec = convertSCEVToAddRecWithPredicates(V, L, Predicates); if (!AddRec || AddRec->getLoop() != L) return getCouldNotCompute(); // If this is a quadratic (3-term) AddRec {L,+,M,+,N}, find the roots of // the quadratic equation to solve it. if (AddRec->isQuadratic() && AddRec->getType()->isIntegerTy()) { // We can only use this value if the chrec ends up with an exact zero // value at this index. When solving for "X*X != 5", for example, we // should not accept a root of 2. if (auto S = SolveQuadraticAddRecExact(AddRec, *this)) { const auto *R = cast(getConstant(*S)); return ExitLimit(R, R, R, false, Predicates); } return getCouldNotCompute(); } // Otherwise we can only handle this if it is affine. if (!AddRec->isAffine()) return getCouldNotCompute(); // If this is an affine expression, the execution count of this branch is // the minimum unsigned root of the following equation: // // Start + Step*N = 0 (mod 2^BW) // // equivalent to: // // Step*N = -Start (mod 2^BW) // // where BW is the common bit width of Start and Step. // Get the initial value for the loop. const SCEV *Start = getSCEVAtScope(AddRec->getStart(), L->getParentLoop()); const SCEV *Step = getSCEVAtScope(AddRec->getOperand(1), L->getParentLoop()); // For now we handle only constant steps. // // TODO: Handle a nonconstant Step given AddRec. If the // AddRec is NUW, then (in an unsigned sense) it cannot be counting up to wrap // to 0, it must be counting down to equal 0. Consequently, N = Start / -Step. // We have not yet seen any such cases. const SCEVConstant *StepC = dyn_cast(Step); if (!StepC || StepC->getValue()->isZero()) return getCouldNotCompute(); // For positive steps (counting up until unsigned overflow): // N = -Start/Step (as unsigned) // For negative steps (counting down to zero): // N = Start/-Step // First compute the unsigned distance from zero in the direction of Step. bool CountDown = StepC->getAPInt().isNegative(); const SCEV *Distance = CountDown ? Start : getNegativeSCEV(Start); // Handle unitary steps, which cannot wraparound. 
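  // A worked example (illustrative, i8 arithmetic assumed): for {4,+,4} the
  // equation 4*N = -4 (mod 2^8) gives N = 63, and indeed 4 + 4*63 = 256 == 0
  // (mod 2^8). For a unitary step such as {5,+,-1} the distance is simply
  // Start = 5, and for {251,+,1} it is -Start = 5; that is the N = Distance
  // rule applied below.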
// 1*N = -Start; -1*N = Start (mod 2^BW), so: // N = Distance (as unsigned) if (StepC->getValue()->isOne() || StepC->getValue()->isMinusOne()) { APInt MaxBECount = getUnsignedRangeMax(applyLoopGuards(Distance, L)); MaxBECount = APIntOps::umin(MaxBECount, getUnsignedRangeMax(Distance)); // When a loop like "for (int i = 0; i != n; ++i) { /* body */ }" is rotated, // we end up with a loop whose backedge-taken count is n - 1. Detect this // case, and see if we can improve the bound. // // Explicitly handling this here is necessary because getUnsignedRange // isn't context-sensitive; it doesn't know that we only care about the // range inside the loop. const SCEV *Zero = getZero(Distance->getType()); const SCEV *One = getOne(Distance->getType()); const SCEV *DistancePlusOne = getAddExpr(Distance, One); if (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, DistancePlusOne, Zero)) { // If Distance + 1 doesn't overflow, we can compute the maximum distance // as "unsigned_max(Distance + 1) - 1". ConstantRange CR = getUnsignedRange(DistancePlusOne); MaxBECount = APIntOps::umin(MaxBECount, CR.getUnsignedMax() - 1); } return ExitLimit(Distance, getConstant(MaxBECount), Distance, false, Predicates); } // If the condition controls loop exit (the loop exits only if the expression // is true) and the addition is no-wrap we can use unsigned divide to // compute the backedge count. In this case, the step may not divide the // distance, but we don't care because if the condition is "missed" the loop // will have undefined behavior due to wrapping. if (ControlsOnlyExit && AddRec->hasNoSelfWrap() && loopHasNoAbnormalExits(AddRec->getLoop())) { const SCEV *Exact = getUDivExpr(Distance, CountDown ? getNegativeSCEV(Step) : Step); const SCEV *ConstantMax = getCouldNotCompute(); if (Exact != getCouldNotCompute()) { APInt MaxInt = getUnsignedRangeMax(applyLoopGuards(Exact, L)); ConstantMax = getConstant(APIntOps::umin(MaxInt, getUnsignedRangeMax(Exact))); } const SCEV *SymbolicMax = isa(Exact) ? ConstantMax : Exact; return ExitLimit(Exact, ConstantMax, SymbolicMax, false, Predicates); } // Solve the general equation. const SCEV *E = SolveLinEquationWithOverflow(StepC->getAPInt(), getNegativeSCEV(Start), *this); const SCEV *M = E; if (E != getCouldNotCompute()) { APInt MaxWithGuards = getUnsignedRangeMax(applyLoopGuards(E, L)); M = getConstant(APIntOps::umin(MaxWithGuards, getUnsignedRangeMax(E))); } auto *S = isa(E) ? M : E; return ExitLimit(E, M, S, false, Predicates); } ScalarEvolution::ExitLimit ScalarEvolution::howFarToNonZero(const SCEV *V, const Loop *L) { // Loops that look like: while (X == 0) are very strange indeed. We don't // handle them yet except for the trivial case. This could be expanded in the // future as needed. // If the value is a constant, check to see if it is known to be non-zero // already. If so, the backedge will execute zero times. if (const SCEVConstant *C = dyn_cast(V)) { if (!C->getValue()->isZero()) return getZero(C->getType()); return getCouldNotCompute(); // Otherwise it will loop infinitely. } // We could implement others, but I really doubt anyone writes loops like // this, and if they did, they would already be constant folded. return getCouldNotCompute(); } std::pair ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(const BasicBlock *BB) const { // If the block has a unique predecessor, then there is no path from the // predecessor to the block that does not go through the direct edge // from the predecessor to the block. 
if (const BasicBlock *Pred = BB->getSinglePredecessor()) return {Pred, BB}; // A loop's header is defined to be a block that dominates the loop. // If the header has a unique predecessor outside the loop, it must be // a block that has exactly one successor that can reach the loop. if (const Loop *L = LI.getLoopFor(BB)) return {L->getLoopPredecessor(), L->getHeader()}; return {nullptr, nullptr}; } /// SCEV structural equivalence is usually sufficient for testing whether two /// expressions are equal, however for the purposes of looking for a condition /// guarding a loop, it can be useful to be a little more general, since a /// front-end may have replicated the controlling expression. static bool HasSameValue(const SCEV *A, const SCEV *B) { // Quick check to see if they are the same SCEV. if (A == B) return true; auto ComputesEqualValues = [](const Instruction *A, const Instruction *B) { // Not all instructions that are "identical" compute the same value. For // instance, two distinct alloca instructions allocating the same type are // identical and do not read memory; but compute distinct values. return A->isIdenticalTo(B) && (isa(A) || isa(A)); }; // Otherwise, if they're both SCEVUnknown, it's possible that they hold // two different instructions with the same value. Check for this case. if (const SCEVUnknown *AU = dyn_cast(A)) if (const SCEVUnknown *BU = dyn_cast(B)) if (const Instruction *AI = dyn_cast(AU->getValue())) if (const Instruction *BI = dyn_cast(BU->getValue())) if (ComputesEqualValues(AI, BI)) return true; // Otherwise assume they may have a different value. return false; } bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred, const SCEV *&LHS, const SCEV *&RHS, unsigned Depth) { bool Changed = false; // Simplifies ICMP to trivial true or false by turning it into '0 == 0' or // '0 != 0'. auto TrivialCase = [&](bool TriviallyTrue) { LHS = RHS = getConstant(ConstantInt::getFalse(getContext())); Pred = TriviallyTrue ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE; return true; }; // If we hit the max recursion limit bail out. if (Depth >= 3) return false; // Canonicalize a constant to the right side. if (const SCEVConstant *LHSC = dyn_cast(LHS)) { // Check for both operands constant. if (const SCEVConstant *RHSC = dyn_cast(RHS)) { if (ConstantExpr::getICmp(Pred, LHSC->getValue(), RHSC->getValue())->isNullValue()) return TrivialCase(false); return TrivialCase(true); } // Otherwise swap the operands to put the constant on the right. std::swap(LHS, RHS); Pred = ICmpInst::getSwappedPredicate(Pred); Changed = true; } // If we're comparing an addrec with a value which is loop-invariant in the // addrec's loop, put the addrec on the left. Also make a dominance check, // as both operands could be addrecs loop-invariant in each other's loop. if (const SCEVAddRecExpr *AR = dyn_cast(RHS)) { const Loop *L = AR->getLoop(); if (isLoopInvariant(LHS, L) && properlyDominates(LHS, L->getHeader())) { std::swap(LHS, RHS); Pred = ICmpInst::getSwappedPredicate(Pred); Changed = true; } } // If there's a constant operand, canonicalize comparisons with boundary // cases, and canonicalize *-or-equal comparisons to regular comparisons. 
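  // For example (illustrative, i8 operands assumed): "X u< 1" has the exact
  // range {0} and is rewritten to "X == 0"; "X s<= 127" covers the full set
  // and folds to the trivially-true case; "X u>= 2" becomes "X u> 1" via the
  // *-or-equal rewrites below.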
if (const SCEVConstant *RC = dyn_cast(RHS)) { const APInt &RA = RC->getAPInt(); bool SimplifiedByConstantRange = false; if (!ICmpInst::isEquality(Pred)) { ConstantRange ExactCR = ConstantRange::makeExactICmpRegion(Pred, RA); if (ExactCR.isFullSet()) return TrivialCase(true); if (ExactCR.isEmptySet()) return TrivialCase(false); APInt NewRHS; CmpInst::Predicate NewPred; if (ExactCR.getEquivalentICmp(NewPred, NewRHS) && ICmpInst::isEquality(NewPred)) { // We were able to convert an inequality to an equality. Pred = NewPred; RHS = getConstant(NewRHS); Changed = SimplifiedByConstantRange = true; } } if (!SimplifiedByConstantRange) { switch (Pred) { default: break; case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_NE: // Fold ((-1) * %a) + %b == 0 (equivalent to %b-%a == 0) into %a == %b. if (!RA) if (const SCEVAddExpr *AE = dyn_cast(LHS)) if (const SCEVMulExpr *ME = dyn_cast(AE->getOperand(0))) if (AE->getNumOperands() == 2 && ME->getNumOperands() == 2 && ME->getOperand(0)->isAllOnesValue()) { RHS = AE->getOperand(1); LHS = ME->getOperand(1); Changed = true; } break; // The "Should have been caught earlier!" messages refer to the fact // that the ExactCR.isFullSet() or ExactCR.isEmptySet() check above // should have fired on the corresponding cases, and canonicalized the // check to trivial case. case ICmpInst::ICMP_UGE: assert(!RA.isMinValue() && "Should have been caught earlier!"); Pred = ICmpInst::ICMP_UGT; RHS = getConstant(RA - 1); Changed = true; break; case ICmpInst::ICMP_ULE: assert(!RA.isMaxValue() && "Should have been caught earlier!"); Pred = ICmpInst::ICMP_ULT; RHS = getConstant(RA + 1); Changed = true; break; case ICmpInst::ICMP_SGE: assert(!RA.isMinSignedValue() && "Should have been caught earlier!"); Pred = ICmpInst::ICMP_SGT; RHS = getConstant(RA - 1); Changed = true; break; case ICmpInst::ICMP_SLE: assert(!RA.isMaxSignedValue() && "Should have been caught earlier!"); Pred = ICmpInst::ICMP_SLT; RHS = getConstant(RA + 1); Changed = true; break; } } } // Check for obvious equality. if (HasSameValue(LHS, RHS)) { if (ICmpInst::isTrueWhenEqual(Pred)) return TrivialCase(true); if (ICmpInst::isFalseWhenEqual(Pred)) return TrivialCase(false); } // If possible, canonicalize GE/LE comparisons to GT/LT comparisons, by // adding or subtracting 1 from one of the operands. 
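  // For instance (illustrative): "X s<= Y" becomes "X s< Y + 1" when the
  // signed max of Y is known not to be SINT_MAX (so Y + 1 cannot overflow),
  // and otherwise "X - 1 s< Y" when the signed min of X is known not to be
  // SINT_MIN; the unsigned cases follow the same pattern with the unsigned
  // boundaries.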
switch (Pred) { case ICmpInst::ICMP_SLE: if (!getSignedRangeMax(RHS).isMaxSignedValue()) { RHS = getAddExpr(getConstant(RHS->getType(), 1, true), RHS, SCEV::FlagNSW); Pred = ICmpInst::ICMP_SLT; Changed = true; } else if (!getSignedRangeMin(LHS).isMinSignedValue()) { LHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), LHS, SCEV::FlagNSW); Pred = ICmpInst::ICMP_SLT; Changed = true; } break; case ICmpInst::ICMP_SGE: if (!getSignedRangeMin(RHS).isMinSignedValue()) { RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS, SCEV::FlagNSW); Pred = ICmpInst::ICMP_SGT; Changed = true; } else if (!getSignedRangeMax(LHS).isMaxSignedValue()) { LHS = getAddExpr(getConstant(RHS->getType(), 1, true), LHS, SCEV::FlagNSW); Pred = ICmpInst::ICMP_SGT; Changed = true; } break; case ICmpInst::ICMP_ULE: if (!getUnsignedRangeMax(RHS).isMaxValue()) { RHS = getAddExpr(getConstant(RHS->getType(), 1, true), RHS, SCEV::FlagNUW); Pred = ICmpInst::ICMP_ULT; Changed = true; } else if (!getUnsignedRangeMin(LHS).isMinValue()) { LHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), LHS); Pred = ICmpInst::ICMP_ULT; Changed = true; } break; case ICmpInst::ICMP_UGE: if (!getUnsignedRangeMin(RHS).isMinValue()) { RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS); Pred = ICmpInst::ICMP_UGT; Changed = true; } else if (!getUnsignedRangeMax(LHS).isMaxValue()) { LHS = getAddExpr(getConstant(RHS->getType(), 1, true), LHS, SCEV::FlagNUW); Pred = ICmpInst::ICMP_UGT; Changed = true; } break; default: break; } // TODO: More simplifications are possible here. // Recursively simplify until we either hit a recursion limit or nothing // changes. if (Changed) return SimplifyICmpOperands(Pred, LHS, RHS, Depth + 1); return Changed; } bool ScalarEvolution::isKnownNegative(const SCEV *S) { return getSignedRangeMax(S).isNegative(); } bool ScalarEvolution::isKnownPositive(const SCEV *S) { return getSignedRangeMin(S).isStrictlyPositive(); } bool ScalarEvolution::isKnownNonNegative(const SCEV *S) { return !getSignedRangeMin(S).isNegative(); } bool ScalarEvolution::isKnownNonPositive(const SCEV *S) { return !getSignedRangeMax(S).isStrictlyPositive(); } bool ScalarEvolution::isKnownNonZero(const SCEV *S) { return getUnsignedRangeMin(S) != 0; } std::pair ScalarEvolution::SplitIntoInitAndPostInc(const Loop *L, const SCEV *S) { // Compute SCEV on entry of loop L. const SCEV *Start = SCEVInitRewriter::rewrite(S, L, *this); if (Start == getCouldNotCompute()) return { Start, Start }; // Compute post increment SCEV for loop L. const SCEV *PostInc = SCEVPostIncRewriter::rewrite(S, L, *this); assert(PostInc != getCouldNotCompute() && "Unexpected could not compute"); return { Start, PostInc }; } bool ScalarEvolution::isKnownViaInduction(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { // First collect all loops. SmallPtrSet LoopsUsed; getUsedLoops(LHS, LoopsUsed); getUsedLoops(RHS, LoopsUsed); if (LoopsUsed.empty()) return false; // Domination relationship must be a linear order on collected loops. #ifndef NDEBUG for (const auto *L1 : LoopsUsed) for (const auto *L2 : LoopsUsed) assert((DT.dominates(L1->getHeader(), L2->getHeader()) || DT.dominates(L2->getHeader(), L1->getHeader())) && "Domination relationship is not a linear order"); #endif const Loop *MDL = *std::max_element(LoopsUsed.begin(), LoopsUsed.end(), [&](const Loop *L1, const Loop *L2) { return DT.properlyDominates(L1->getHeader(), L2->getHeader()); }); // Get init and post increment value for LHS. 
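  // (Sketch of the induction argument used here: if the predicate holds for
  // the start values on loop entry and the backedge is guarded by the
  // predicate on the post-increment values, it holds on every iteration.
  // E.g. proving {0,+,1} s< N roughly reduces to 0 s< N at entry plus the
  // backedge being guarded by i + 1 s< N.)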
auto SplitLHS = SplitIntoInitAndPostInc(MDL, LHS); // if LHS contains unknown non-invariant SCEV then bail out. if (SplitLHS.first == getCouldNotCompute()) return false; assert (SplitLHS.second != getCouldNotCompute() && "Unexpected CNC"); // Get init and post increment value for RHS. auto SplitRHS = SplitIntoInitAndPostInc(MDL, RHS); // if RHS contains unknown non-invariant SCEV then bail out. if (SplitRHS.first == getCouldNotCompute()) return false; assert (SplitRHS.second != getCouldNotCompute() && "Unexpected CNC"); // It is possible that init SCEV contains an invariant load but it does // not dominate MDL and is not available at MDL loop entry, so we should // check it here. if (!isAvailableAtLoopEntry(SplitLHS.first, MDL) || !isAvailableAtLoopEntry(SplitRHS.first, MDL)) return false; // It seems backedge guard check is faster than entry one so in some cases // it can speed up whole estimation by short circuit return isLoopBackedgeGuardedByCond(MDL, Pred, SplitLHS.second, SplitRHS.second) && isLoopEntryGuardedByCond(MDL, Pred, SplitLHS.first, SplitRHS.first); } bool ScalarEvolution::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { // Canonicalize the inputs first. (void)SimplifyICmpOperands(Pred, LHS, RHS); if (isKnownViaInduction(Pred, LHS, RHS)) return true; if (isKnownPredicateViaSplitting(Pred, LHS, RHS)) return true; // Otherwise see what can be done with some simple reasoning. return isKnownViaNonRecursiveReasoning(Pred, LHS, RHS); } std::optional ScalarEvolution::evaluatePredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { if (isKnownPredicate(Pred, LHS, RHS)) return true; if (isKnownPredicate(ICmpInst::getInversePredicate(Pred), LHS, RHS)) return false; return std::nullopt; } bool ScalarEvolution::isKnownPredicateAt(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const Instruction *CtxI) { // TODO: Analyze guards and assumes from Context's block. return isKnownPredicate(Pred, LHS, RHS) || isBasicBlockEntryGuardedByCond(CtxI->getParent(), Pred, LHS, RHS); } std::optional ScalarEvolution::evaluatePredicateAt(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const Instruction *CtxI) { std::optional KnownWithoutContext = evaluatePredicate(Pred, LHS, RHS); if (KnownWithoutContext) return KnownWithoutContext; if (isBasicBlockEntryGuardedByCond(CtxI->getParent(), Pred, LHS, RHS)) return true; if (isBasicBlockEntryGuardedByCond(CtxI->getParent(), ICmpInst::getInversePredicate(Pred), LHS, RHS)) return false; return std::nullopt; } bool ScalarEvolution::isKnownOnEveryIteration(ICmpInst::Predicate Pred, const SCEVAddRecExpr *LHS, const SCEV *RHS) { const Loop *L = LHS->getLoop(); return isLoopEntryGuardedByCond(L, Pred, LHS->getStart(), RHS) && isLoopBackedgeGuardedByCond(L, Pred, LHS->getPostIncExpr(*this), RHS); } std::optional ScalarEvolution::getMonotonicPredicateType(const SCEVAddRecExpr *LHS, ICmpInst::Predicate Pred) { auto Result = getMonotonicPredicateTypeImpl(LHS, Pred); #ifndef NDEBUG // Verify an invariant: inverting the predicate should turn a monotonically // increasing change to a monotonically decreasing one, and vice versa. 
if (Result) { auto ResultSwapped = getMonotonicPredicateTypeImpl(LHS, ICmpInst::getSwappedPredicate(Pred)); assert(*ResultSwapped != *Result && "monotonicity should flip as we flip the predicate"); } #endif return Result; } std::optional ScalarEvolution::getMonotonicPredicateTypeImpl(const SCEVAddRecExpr *LHS, ICmpInst::Predicate Pred) { // A zero step value for LHS means the induction variable is essentially a // loop invariant value. We don't really depend on the predicate actually // flipping from false to true (for increasing predicates, and the other way // around for decreasing predicates), all we care about is that *if* the // predicate changes then it only changes from false to true. // // A zero step value in itself is not very useful, but there may be places // where SCEV can prove X >= 0 but not prove X > 0, so it is helpful to be // as general as possible. // Only handle LE/LT/GE/GT predicates. if (!ICmpInst::isRelational(Pred)) return std::nullopt; bool IsGreater = ICmpInst::isGE(Pred) || ICmpInst::isGT(Pred); assert((IsGreater || ICmpInst::isLE(Pred) || ICmpInst::isLT(Pred)) && "Should be greater or less!"); // Check that AR does not wrap. if (ICmpInst::isUnsigned(Pred)) { if (!LHS->hasNoUnsignedWrap()) return std::nullopt; return IsGreater ? MonotonicallyIncreasing : MonotonicallyDecreasing; } assert(ICmpInst::isSigned(Pred) && "Relational predicate is either signed or unsigned!"); if (!LHS->hasNoSignedWrap()) return std::nullopt; const SCEV *Step = LHS->getStepRecurrence(*this); if (isKnownNonNegative(Step)) return IsGreater ? MonotonicallyIncreasing : MonotonicallyDecreasing; if (isKnownNonPositive(Step)) return !IsGreater ? MonotonicallyIncreasing : MonotonicallyDecreasing; return std::nullopt; } std::optional ScalarEvolution::getLoopInvariantPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const Loop *L, const Instruction *CtxI) { // If there is a loop-invariant, force it into the RHS, otherwise bail out. if (!isLoopInvariant(RHS, L)) { if (!isLoopInvariant(LHS, L)) return std::nullopt; std::swap(LHS, RHS); Pred = ICmpInst::getSwappedPredicate(Pred); } const SCEVAddRecExpr *ArLHS = dyn_cast(LHS); if (!ArLHS || ArLHS->getLoop() != L) return std::nullopt; auto MonotonicType = getMonotonicPredicateType(ArLHS, Pred); if (!MonotonicType) return std::nullopt; // If the predicate "ArLHS `Pred` RHS" monotonically increases from false to // true as the loop iterates, and the backedge is control dependent on // "ArLHS `Pred` RHS" == true then we can reason as follows: // // * if the predicate was false in the first iteration then the predicate // is never evaluated again, since the loop exits without taking the // backedge. // * if the predicate was true in the first iteration then it will // continue to be true for all future iterations since it is // monotonically increasing. // // For both the above possibilities, we can replace the loop varying // predicate with its value on the first iteration of the loop (which is // loop invariant). // // A similar reasoning applies for a monotonically decreasing predicate, by // replacing true with false and false with true in the above two bullets. bool Increasing = *MonotonicType == ScalarEvolution::MonotonicallyIncreasing; auto P = Increasing ? Pred : ICmpInst::getInversePredicate(Pred); if (isLoopBackedgeGuardedByCond(L, P, LHS, RHS)) return ScalarEvolution::LoopInvariantPredicate(Pred, ArLHS->getStart(), RHS); if (!CtxI) return std::nullopt; // Try to prove via context. // TODO: Support other cases. 
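  // (Hedged sketch of the context-based case below: when the addrec cannot
  // cross the signed/unsigned boundary, its step is known positive and RHS is
  // known non-negative, a fact established at the context instruction for the
  // sign-flipped form of Pred can be carried over to the loop-invariant start
  // value of the addrec.)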
switch (Pred) { default: break; case ICmpInst::ICMP_ULE: case ICmpInst::ICMP_ULT: { assert(ArLHS->hasNoUnsignedWrap() && "Is a requirement of monotonicity!"); // Given preconditions // (1) ArLHS does not cross the border of positive and negative parts of // range because of: // - Positive step; (TODO: lift this limitation) // - nuw - does not cross zero boundary; // - nsw - does not cross SINT_MAX boundary; // (2) ArLHS =s 0 // we can replace the loop variant ArLHS ArLHS Start(ArLHS) >=s 0. // We can strengthen this to Start(ArLHS) hasNoSignedWrap() && ArLHS->isAffine() && isKnownPositive(ArLHS->getStepRecurrence(*this)) && isKnownNonNegative(RHS) && isKnownPredicateAt(SignFlippedPred, ArLHS, RHS, CtxI)) return ScalarEvolution::LoopInvariantPredicate(Pred, ArLHS->getStart(), RHS); } } return std::nullopt; } std::optional ScalarEvolution::getLoopInvariantExitCondDuringFirstIterations( ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const Loop *L, const Instruction *CtxI, const SCEV *MaxIter) { if (auto LIP = getLoopInvariantExitCondDuringFirstIterationsImpl( Pred, LHS, RHS, L, CtxI, MaxIter)) return LIP; if (auto *UMin = dyn_cast(MaxIter)) // Number of iterations expressed as UMIN isn't always great for expressing // the value on the last iteration. If the straightforward approach didn't // work, try the following trick: if the a predicate is invariant for X, it // is also invariant for umin(X, ...). So try to find something that works // among subexpressions of MaxIter expressed as umin. for (auto *Op : UMin->operands()) if (auto LIP = getLoopInvariantExitCondDuringFirstIterationsImpl( Pred, LHS, RHS, L, CtxI, Op)) return LIP; return std::nullopt; } std::optional ScalarEvolution::getLoopInvariantExitCondDuringFirstIterationsImpl( ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const Loop *L, const Instruction *CtxI, const SCEV *MaxIter) { // Try to prove the following set of facts: // - The predicate is monotonic in the iteration space. // - If the check does not fail on the 1st iteration: // - No overflow will happen during first MaxIter iterations; // - It will not fail on the MaxIter'th iteration. // If the check does fail on the 1st iteration, we leave the loop and no // other checks matter. // If there is a loop-invariant, force it into the RHS, otherwise bail out. if (!isLoopInvariant(RHS, L)) { if (!isLoopInvariant(LHS, L)) return std::nullopt; std::swap(LHS, RHS); Pred = ICmpInst::getSwappedPredicate(Pred); } auto *AR = dyn_cast(LHS); if (!AR || AR->getLoop() != L) return std::nullopt; // The predicate must be relational (i.e. <, <=, >=, >). if (!ICmpInst::isRelational(Pred)) return std::nullopt; // TODO: Support steps other than +/- 1. const SCEV *Step = AR->getStepRecurrence(*this); auto *One = getOne(Step->getType()); auto *MinusOne = getNegativeSCEV(One); if (Step != One && Step != MinusOne) return std::nullopt; // Type mismatch here means that MaxIter is potentially larger than max // unsigned value in start type, which mean we cannot prove no wrap for the // indvar. if (AR->getType() != MaxIter->getType()) return std::nullopt; // Value of IV on suggested last iteration. const SCEV *Last = AR->evaluateAtIteration(MaxIter, *this); // Does it still meet the requirement? if (!isLoopBackedgeGuardedByCond(L, Pred, Last, RHS)) return std::nullopt; // Because step is +/- 1 and MaxIter has same type as Start (i.e. it does // not exceed max unsigned value of this type), this effectively proves // that there is no wrap during the iteration. 
To prove that there is no // signed/unsigned wrap, we need to check that // Start <= Last for step = 1 or Start >= Last for step = -1. ICmpInst::Predicate NoOverflowPred = CmpInst::isSigned(Pred) ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; if (Step == MinusOne) NoOverflowPred = CmpInst::getSwappedPredicate(NoOverflowPred); const SCEV *Start = AR->getStart(); if (!isKnownPredicateAt(NoOverflowPred, Start, Last, CtxI)) return std::nullopt; // Everything is fine. return ScalarEvolution::LoopInvariantPredicate(Pred, Start, RHS); } bool ScalarEvolution::isKnownPredicateViaConstantRanges( ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { if (HasSameValue(LHS, RHS)) return ICmpInst::isTrueWhenEqual(Pred); // This code is split out from isKnownPredicate because it is called from // within isLoopEntryGuardedByCond. auto CheckRanges = [&](const ConstantRange &RangeLHS, const ConstantRange &RangeRHS) { return RangeLHS.icmp(Pred, RangeRHS); }; // The check at the top of the function catches the case where the values are // known to be equal. if (Pred == CmpInst::ICMP_EQ) return false; if (Pred == CmpInst::ICMP_NE) { auto SL = getSignedRange(LHS); auto SR = getSignedRange(RHS); if (CheckRanges(SL, SR)) return true; auto UL = getUnsignedRange(LHS); auto UR = getUnsignedRange(RHS); if (CheckRanges(UL, UR)) return true; auto *Diff = getMinusSCEV(LHS, RHS); return !isa(Diff) && isKnownNonZero(Diff); } if (CmpInst::isSigned(Pred)) { auto SL = getSignedRange(LHS); auto SR = getSignedRange(RHS); return CheckRanges(SL, SR); } auto UL = getUnsignedRange(LHS); auto UR = getUnsignedRange(RHS); return CheckRanges(UL, UR); } bool ScalarEvolution::isKnownPredicateViaNoOverflow(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { // Match X to (A + C1) and Y to (A + C2), where // C1 and C2 are constant integers. If either X or Y are not add expressions, // consider them as X + 0 and Y + 0 respectively. C1 and C2 are returned via // OutC1 and OutC2. auto MatchBinaryAddToConst = [this](const SCEV *X, const SCEV *Y, APInt &OutC1, APInt &OutC2, SCEV::NoWrapFlags ExpectedFlags) { const SCEV *XNonConstOp, *XConstOp; const SCEV *YNonConstOp, *YConstOp; SCEV::NoWrapFlags XFlagsPresent; SCEV::NoWrapFlags YFlagsPresent; if (!splitBinaryAdd(X, XConstOp, XNonConstOp, XFlagsPresent)) { XConstOp = getZero(X->getType()); XNonConstOp = X; XFlagsPresent = ExpectedFlags; } if (!isa(XConstOp) || (XFlagsPresent & ExpectedFlags) != ExpectedFlags) return false; if (!splitBinaryAdd(Y, YConstOp, YNonConstOp, YFlagsPresent)) { YConstOp = getZero(Y->getType()); YNonConstOp = Y; YFlagsPresent = ExpectedFlags; } if (!isa(YConstOp) || (YFlagsPresent & ExpectedFlags) != ExpectedFlags) return false; if (YNonConstOp != XNonConstOp) return false; OutC1 = cast(XConstOp)->getAPInt(); OutC2 = cast(YConstOp)->getAPInt(); return true; }; APInt C1; APInt C2; switch (Pred) { default: break; case ICmpInst::ICMP_SGE: std::swap(LHS, RHS); [[fallthrough]]; case ICmpInst::ICMP_SLE: // (X + C1) s<= (X + C2) if C1 s<= C2. if (MatchBinaryAddToConst(LHS, RHS, C1, C2, SCEV::FlagNSW) && C1.sle(C2)) return true; break; case ICmpInst::ICMP_SGT: std::swap(LHS, RHS); [[fallthrough]]; case ICmpInst::ICMP_SLT: // (X + C1) s< (X + C2) if C1 s< C2. if (MatchBinaryAddToConst(LHS, RHS, C1, C2, SCEV::FlagNSW) && C1.slt(C2)) return true; break; case ICmpInst::ICMP_UGE: std::swap(LHS, RHS); [[fallthrough]]; case ICmpInst::ICMP_ULE: // (X + C1) u<= (X + C2) for C1 u<= C2. 
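    // (Why the no-wrap requirement matters, illustrative i8 example: with
    // X = 250, C1 = 1, C2 = 10, X + C1 = 251 but X + C2 wraps to 4, so
    // 251 u<= 4 is false even though C1 u<= C2. The NUW flag rules such
    // cases out.)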
if (MatchBinaryAddToConst(RHS, LHS, C2, C1, SCEV::FlagNUW) && C1.ule(C2)) return true; break; case ICmpInst::ICMP_UGT: std::swap(LHS, RHS); [[fallthrough]]; case ICmpInst::ICMP_ULT: // (X + C1) u< (X + C2) if C1 u< C2. if (MatchBinaryAddToConst(RHS, LHS, C2, C1, SCEV::FlagNUW) && C1.ult(C2)) return true; break; } return false; } bool ScalarEvolution::isKnownPredicateViaSplitting(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { if (Pred != ICmpInst::ICMP_ULT || ProvingSplitPredicate) return false; // Allowing arbitrary number of activations of isKnownPredicateViaSplitting on // the stack can result in exponential time complexity. SaveAndRestore Restore(ProvingSplitPredicate, true); // If L >= 0 then I `ult` L <=> I >= 0 && I `slt` L // // To prove L >= 0 we use isKnownNonNegative whereas to prove I >= 0 we use // isKnownPredicate. isKnownPredicate is more powerful, but also more // expensive; and using isKnownNonNegative(RHS) is sufficient for most of the // interesting cases seen in practice. We can consider "upgrading" L >= 0 to // use isKnownPredicate later if needed. return isKnownNonNegative(RHS) && isKnownPredicate(CmpInst::ICMP_SGE, LHS, getZero(LHS->getType())) && isKnownPredicate(CmpInst::ICMP_SLT, LHS, RHS); } bool ScalarEvolution::isImpliedViaGuard(const BasicBlock *BB, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { // No need to even try if we know the module has no guards. if (!HasGuards) return false; return any_of(*BB, [&](const Instruction &I) { using namespace llvm::PatternMatch; Value *Condition; return match(&I, m_Intrinsic( m_Value(Condition))) && isImpliedCond(Pred, LHS, RHS, Condition, false); }); } /// isLoopBackedgeGuardedByCond - Test whether the backedge of the loop is /// protected by a conditional between LHS and RHS. This is used to /// to eliminate casts. bool ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { // Interpret a null as meaning no loop, where there is obviously no guard // (interprocedural conditions notwithstanding). Do not bother about // unreachable loops. if (!L || !DT.isReachableFromEntry(L->getHeader())) return true; if (VerifyIR) assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()) && "This cannot be done on broken IR!"); if (isKnownViaNonRecursiveReasoning(Pred, LHS, RHS)) return true; BasicBlock *Latch = L->getLoopLatch(); if (!Latch) return false; BranchInst *LoopContinuePredicate = dyn_cast(Latch->getTerminator()); if (LoopContinuePredicate && LoopContinuePredicate->isConditional() && isImpliedCond(Pred, LHS, RHS, LoopContinuePredicate->getCondition(), LoopContinuePredicate->getSuccessor(0) != L->getHeader())) return true; // We don't want more than one activation of the following loops on the stack // -- that can lead to O(n!) time complexity. if (WalkingBEDominatingConds) return false; SaveAndRestore ClearOnExit(WalkingBEDominatingConds, true); // See if we can exploit a trip count to prove the predicate. const auto &BETakenInfo = getBackedgeTakenInfo(L); const SCEV *LatchBECount = BETakenInfo.getExact(Latch, this); if (LatchBECount != getCouldNotCompute()) { // We know that Latch branches back to the loop header exactly // LatchBECount times. This means the backdege condition at Latch is // equivalent to "{0,+,1} u< LatchBECount". 
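    // (Illustrative: if LatchBECount is the constant 16, the backedge at the
    // latch is taken exactly while the canonical counter {0,+,1} built below
    // satisfies {0,+,1} u< 16, so that comparison can serve as the known
    // condition when trying to imply LHS `Pred` RHS.)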
Type *Ty = LatchBECount->getType(); auto NoWrapFlags = SCEV::NoWrapFlags(SCEV::FlagNUW | SCEV::FlagNW); const SCEV *LoopCounter = getAddRecExpr(getZero(Ty), getOne(Ty), L, NoWrapFlags); if (isImpliedCond(Pred, LHS, RHS, ICmpInst::ICMP_ULT, LoopCounter, LatchBECount)) return true; } // Check conditions due to any @llvm.assume intrinsics. for (auto &AssumeVH : AC.assumptions()) { if (!AssumeVH) continue; auto *CI = cast(AssumeVH); if (!DT.dominates(CI, Latch->getTerminator())) continue; if (isImpliedCond(Pred, LHS, RHS, CI->getArgOperand(0), false)) return true; } if (isImpliedViaGuard(Latch, Pred, LHS, RHS)) return true; for (DomTreeNode *DTN = DT[Latch], *HeaderDTN = DT[L->getHeader()]; DTN != HeaderDTN; DTN = DTN->getIDom()) { assert(DTN && "should reach the loop header before reaching the root!"); BasicBlock *BB = DTN->getBlock(); if (isImpliedViaGuard(BB, Pred, LHS, RHS)) return true; BasicBlock *PBB = BB->getSinglePredecessor(); if (!PBB) continue; BranchInst *ContinuePredicate = dyn_cast(PBB->getTerminator()); if (!ContinuePredicate || !ContinuePredicate->isConditional()) continue; Value *Condition = ContinuePredicate->getCondition(); // If we have an edge `E` within the loop body that dominates the only // latch, the condition guarding `E` also guards the backedge. This // reasoning works only for loops with a single latch. BasicBlockEdge DominatingEdge(PBB, BB); if (DominatingEdge.isSingleEdge()) { // We're constructively (and conservatively) enumerating edges within the // loop body that dominate the latch. The dominator tree better agree // with us on this: assert(DT.dominates(DominatingEdge, Latch) && "should be!"); if (isImpliedCond(Pred, LHS, RHS, Condition, BB != ContinuePredicate->getSuccessor(0))) return true; } } return false; } bool ScalarEvolution::isBasicBlockEntryGuardedByCond(const BasicBlock *BB, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { // Do not bother proving facts for unreachable code. if (!DT.isReachableFromEntry(BB)) return true; if (VerifyIR) assert(!verifyFunction(*BB->getParent(), &dbgs()) && "This cannot be done on broken IR!"); // If we cannot prove strict comparison (e.g. a > b), maybe we can prove // the facts (a >= b && a != b) separately. A typical situation is when the // non-strict comparison is known from ranges and non-equality is known from // dominating predicates. If we are proving strict comparison, we always try // to prove non-equality and non-strict comparison separately. auto NonStrictPredicate = ICmpInst::getNonStrictPredicate(Pred); const bool ProvingStrictComparison = (Pred != NonStrictPredicate); bool ProvedNonStrictComparison = false; bool ProvedNonEquality = false; auto SplitAndProve = [&](std::function Fn) -> bool { if (!ProvedNonStrictComparison) ProvedNonStrictComparison = Fn(NonStrictPredicate); if (!ProvedNonEquality) ProvedNonEquality = Fn(ICmpInst::ICMP_NE); if (ProvedNonStrictComparison && ProvedNonEquality) return true; return false; }; if (ProvingStrictComparison) { auto ProofFn = [&](ICmpInst::Predicate P) { return isKnownViaNonRecursiveReasoning(P, LHS, RHS); }; if (SplitAndProve(ProofFn)) return true; } // Try to prove (Pred, LHS, RHS) using isImpliedCond. 
auto ProveViaCond = [&](const Value *Condition, bool Inverse) { const Instruction *CtxI = &BB->front(); if (isImpliedCond(Pred, LHS, RHS, Condition, Inverse, CtxI)) return true; if (ProvingStrictComparison) { auto ProofFn = [&](ICmpInst::Predicate P) { return isImpliedCond(P, LHS, RHS, Condition, Inverse, CtxI); }; if (SplitAndProve(ProofFn)) return true; } return false; }; // Starting at the block's predecessor, climb up the predecessor chain, as long // as there are predecessors that can be found that have unique successors // leading to the original block. const Loop *ContainingLoop = LI.getLoopFor(BB); const BasicBlock *PredBB; if (ContainingLoop && ContainingLoop->getHeader() == BB) PredBB = ContainingLoop->getLoopPredecessor(); else PredBB = BB->getSinglePredecessor(); for (std::pair Pair(PredBB, BB); Pair.first; Pair = getPredecessorWithUniqueSuccessorForBB(Pair.first)) { const BranchInst *BlockEntryPredicate = dyn_cast(Pair.first->getTerminator()); if (!BlockEntryPredicate || BlockEntryPredicate->isUnconditional()) continue; if (ProveViaCond(BlockEntryPredicate->getCondition(), BlockEntryPredicate->getSuccessor(0) != Pair.second)) return true; } // Check conditions due to any @llvm.assume intrinsics. for (auto &AssumeVH : AC.assumptions()) { if (!AssumeVH) continue; auto *CI = cast(AssumeVH); if (!DT.dominates(CI, BB)) continue; if (ProveViaCond(CI->getArgOperand(0), false)) return true; } // Check conditions due to any @llvm.experimental.guard intrinsics. auto *GuardDecl = F.getParent()->getFunction( Intrinsic::getName(Intrinsic::experimental_guard)); if (GuardDecl) for (const auto *GU : GuardDecl->users()) if (const auto *Guard = dyn_cast(GU)) if (Guard->getFunction() == BB->getParent() && DT.dominates(Guard, BB)) if (ProveViaCond(Guard->getArgOperand(0), false)) return true; return false; } bool ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { // Interpret a null as meaning no loop, where there is obviously no guard // (interprocedural conditions notwithstanding). if (!L) return false; // Both LHS and RHS must be available at loop entry. assert(isAvailableAtLoopEntry(LHS, L) && "LHS is not available at Loop Entry"); assert(isAvailableAtLoopEntry(RHS, L) && "RHS is not available at Loop Entry"); if (isKnownViaNonRecursiveReasoning(Pred, LHS, RHS)) return true; return isBasicBlockEntryGuardedByCond(L->getHeader(), Pred, LHS, RHS); } bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const Value *FoundCondValue, bool Inverse, const Instruction *CtxI) { // False conditions implies anything. Do not bother analyzing it further. if (FoundCondValue == ConstantInt::getBool(FoundCondValue->getContext(), Inverse)) return true; if (!PendingLoopPredicates.insert(FoundCondValue).second) return false; auto ClearOnExit = make_scope_exit([&]() { PendingLoopPredicates.erase(FoundCondValue); }); // Recursively handle And and Or conditions. 
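  // (Note: a true "A && B" makes both A and B true, so it suffices that
  // either operand implies the desired fact; dually, a false "A || B" makes
  // both operands false, which is why the Inverse case of Or is handled the
  // same way below.)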
const Value *Op0, *Op1; if (match(FoundCondValue, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))) { if (!Inverse) return isImpliedCond(Pred, LHS, RHS, Op0, Inverse, CtxI) || isImpliedCond(Pred, LHS, RHS, Op1, Inverse, CtxI); } else if (match(FoundCondValue, m_LogicalOr(m_Value(Op0), m_Value(Op1)))) { if (Inverse) return isImpliedCond(Pred, LHS, RHS, Op0, Inverse, CtxI) || isImpliedCond(Pred, LHS, RHS, Op1, Inverse, CtxI); } const ICmpInst *ICI = dyn_cast(FoundCondValue); if (!ICI) return false; // Now that we found a conditional branch that dominates the loop or controls // the loop latch. Check to see if it is the comparison we are looking for. ICmpInst::Predicate FoundPred; if (Inverse) FoundPred = ICI->getInversePredicate(); else FoundPred = ICI->getPredicate(); const SCEV *FoundLHS = getSCEV(ICI->getOperand(0)); const SCEV *FoundRHS = getSCEV(ICI->getOperand(1)); return isImpliedCond(Pred, LHS, RHS, FoundPred, FoundLHS, FoundRHS, CtxI); } bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, ICmpInst::Predicate FoundPred, const SCEV *FoundLHS, const SCEV *FoundRHS, const Instruction *CtxI) { // Balance the types. if (getTypeSizeInBits(LHS->getType()) < getTypeSizeInBits(FoundLHS->getType())) { // For unsigned and equality predicates, try to prove that both found // operands fit into narrow unsigned range. If so, try to prove facts in // narrow types. if (!CmpInst::isSigned(FoundPred) && !FoundLHS->getType()->isPointerTy() && !FoundRHS->getType()->isPointerTy()) { auto *NarrowType = LHS->getType(); auto *WideType = FoundLHS->getType(); auto BitWidth = getTypeSizeInBits(NarrowType); const SCEV *MaxValue = getZeroExtendExpr( getConstant(APInt::getMaxValue(BitWidth)), WideType); if (isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_ULE, FoundLHS, MaxValue) && isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_ULE, FoundRHS, MaxValue)) { const SCEV *TruncFoundLHS = getTruncateExpr(FoundLHS, NarrowType); const SCEV *TruncFoundRHS = getTruncateExpr(FoundRHS, NarrowType); if (isImpliedCondBalancedTypes(Pred, LHS, RHS, FoundPred, TruncFoundLHS, TruncFoundRHS, CtxI)) return true; } } if (LHS->getType()->isPointerTy() || RHS->getType()->isPointerTy()) return false; if (CmpInst::isSigned(Pred)) { LHS = getSignExtendExpr(LHS, FoundLHS->getType()); RHS = getSignExtendExpr(RHS, FoundLHS->getType()); } else { LHS = getZeroExtendExpr(LHS, FoundLHS->getType()); RHS = getZeroExtendExpr(RHS, FoundLHS->getType()); } } else if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(FoundLHS->getType())) { if (FoundLHS->getType()->isPointerTy() || FoundRHS->getType()->isPointerTy()) return false; if (CmpInst::isSigned(FoundPred)) { FoundLHS = getSignExtendExpr(FoundLHS, LHS->getType()); FoundRHS = getSignExtendExpr(FoundRHS, LHS->getType()); } else { FoundLHS = getZeroExtendExpr(FoundLHS, LHS->getType()); FoundRHS = getZeroExtendExpr(FoundRHS, LHS->getType()); } } return isImpliedCondBalancedTypes(Pred, LHS, RHS, FoundPred, FoundLHS, FoundRHS, CtxI); } bool ScalarEvolution::isImpliedCondBalancedTypes( ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, ICmpInst::Predicate FoundPred, const SCEV *FoundLHS, const SCEV *FoundRHS, const Instruction *CtxI) { assert(getTypeSizeInBits(LHS->getType()) == getTypeSizeInBits(FoundLHS->getType()) && "Types should be balanced!"); // Canonicalize the query to match the way instcombine will have // canonicalized the comparison. 
if (SimplifyICmpOperands(Pred, LHS, RHS)) if (LHS == RHS) return CmpInst::isTrueWhenEqual(Pred); if (SimplifyICmpOperands(FoundPred, FoundLHS, FoundRHS)) if (FoundLHS == FoundRHS) return CmpInst::isFalseWhenEqual(FoundPred); // Check to see if we can make the LHS or RHS match. if (LHS == FoundRHS || RHS == FoundLHS) { if (isa(RHS)) { std::swap(FoundLHS, FoundRHS); FoundPred = ICmpInst::getSwappedPredicate(FoundPred); } else { std::swap(LHS, RHS); Pred = ICmpInst::getSwappedPredicate(Pred); } } // Check whether the found predicate is the same as the desired predicate. if (FoundPred == Pred) return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS, CtxI); // Check whether swapping the found predicate makes it the same as the // desired predicate. if (ICmpInst::getSwappedPredicate(FoundPred) == Pred) { // We can write the implication // 0. LHS Pred RHS <- FoundLHS SwapPred FoundRHS // using one of the following ways: // 1. LHS Pred RHS <- FoundRHS Pred FoundLHS // 2. RHS SwapPred LHS <- FoundLHS SwapPred FoundRHS // 3. LHS Pred RHS <- ~FoundLHS Pred ~FoundRHS // 4. ~LHS SwapPred ~RHS <- FoundLHS SwapPred FoundRHS // Forms 1. and 2. require swapping the operands of one condition. Don't // do this if it would break canonical constant/addrec ordering. if (!isa(RHS) && !isa(LHS)) return isImpliedCondOperands(FoundPred, RHS, LHS, FoundLHS, FoundRHS, CtxI); if (!isa(FoundRHS) && !isa(FoundLHS)) return isImpliedCondOperands(Pred, LHS, RHS, FoundRHS, FoundLHS, CtxI); // There's no clear preference between forms 3. and 4., try both. Avoid // forming getNotSCEV of pointer values as the resulting subtract is // not legal. if (!LHS->getType()->isPointerTy() && !RHS->getType()->isPointerTy() && isImpliedCondOperands(FoundPred, getNotSCEV(LHS), getNotSCEV(RHS), FoundLHS, FoundRHS, CtxI)) return true; if (!FoundLHS->getType()->isPointerTy() && !FoundRHS->getType()->isPointerTy() && isImpliedCondOperands(Pred, LHS, RHS, getNotSCEV(FoundLHS), getNotSCEV(FoundRHS), CtxI)) return true; return false; } auto IsSignFlippedPredicate = [](CmpInst::Predicate P1, CmpInst::Predicate P2) { assert(P1 != P2 && "Handled earlier!"); return CmpInst::isRelational(P2) && P1 == CmpInst::getFlippedSignednessPredicate(P2); }; if (IsSignFlippedPredicate(Pred, FoundPred)) { // Unsigned comparison is the same as signed comparison when both the // operands are non-negative or negative. if ((isKnownNonNegative(FoundLHS) && isKnownNonNegative(FoundRHS)) || (isKnownNegative(FoundLHS) && isKnownNegative(FoundRHS))) return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS, CtxI); // Create local copies that we can freely swap and canonicalize our // conditions to "le/lt". 
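    // (For example, "a s> b" is rewritten as "b s< a" by swapping the
    // predicate and the operands, with the found condition swapped the same
    // way, so only the lt/le forms need to be reasoned about below.)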
ICmpInst::Predicate CanonicalPred = Pred, CanonicalFoundPred = FoundPred; const SCEV *CanonicalLHS = LHS, *CanonicalRHS = RHS, *CanonicalFoundLHS = FoundLHS, *CanonicalFoundRHS = FoundRHS; if (ICmpInst::isGT(CanonicalPred) || ICmpInst::isGE(CanonicalPred)) { CanonicalPred = ICmpInst::getSwappedPredicate(CanonicalPred); CanonicalFoundPred = ICmpInst::getSwappedPredicate(CanonicalFoundPred); std::swap(CanonicalLHS, CanonicalRHS); std::swap(CanonicalFoundLHS, CanonicalFoundRHS); } assert((ICmpInst::isLT(CanonicalPred) || ICmpInst::isLE(CanonicalPred)) && "Must be!"); assert((ICmpInst::isLT(CanonicalFoundPred) || ICmpInst::isLE(CanonicalFoundPred)) && "Must be!"); if (ICmpInst::isSigned(CanonicalPred) && isKnownNonNegative(CanonicalRHS)) // Use implication: // x =s 0 --> x x (FoundLHS) || isa(FoundRHS))) { const SCEVConstant *C = nullptr; const SCEV *V = nullptr; if (isa(FoundLHS)) { C = cast(FoundLHS); V = FoundRHS; } else { C = cast(FoundRHS); V = FoundLHS; } // The guarding predicate tells us that C != V. If the known range // of V is [C, t), we can sharpen the range to [C + 1, t). The // range we consider has to correspond to same signedness as the // predicate we're interested in folding. APInt Min = ICmpInst::isSigned(Pred) ? getSignedRangeMin(V) : getUnsignedRangeMin(V); if (Min == C->getAPInt()) { // Given (V >= Min && V != Min) we conclude V >= (Min + 1). // This is true even if (Min + 1) wraps around -- in case of // wraparound, (Min + 1) < Min, so (V >= Min => V >= (Min + 1)). APInt SharperMin = Min + 1; switch (Pred) { case ICmpInst::ICMP_SGE: case ICmpInst::ICMP_UGE: // We know V `Pred` SharperMin. If this implies LHS `Pred` // RHS, we're done. if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(SharperMin), CtxI)) return true; [[fallthrough]]; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_UGT: // We know from the range information that (V `Pred` Min || // V == Min). We know from the guarding condition that !(V // == Min). This gives us // // V `Pred` Min || V == Min && !(V == Min) // => V `Pred` Min // // If V `Pred` Min implies LHS `Pred` RHS, we're done. if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(Min), CtxI)) return true; break; // `LHS < RHS` and `LHS <= RHS` are handled in the same way as `RHS > LHS` and `RHS >= LHS` respectively. case ICmpInst::ICMP_SLE: case ICmpInst::ICMP_ULE: if (isImpliedCondOperands(CmpInst::getSwappedPredicate(Pred), RHS, LHS, V, getConstant(SharperMin), CtxI)) return true; [[fallthrough]]; case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_ULT: if (isImpliedCondOperands(CmpInst::getSwappedPredicate(Pred), RHS, LHS, V, getConstant(Min), CtxI)) return true; break; default: // No change break; } } } // Check whether the actual condition is beyond sufficient. if (FoundPred == ICmpInst::ICMP_EQ) if (ICmpInst::isTrueWhenEqual(Pred)) if (isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS, CtxI)) return true; if (Pred == ICmpInst::ICMP_NE) if (!ICmpInst::isTrueWhenEqual(FoundPred)) if (isImpliedCondOperands(FoundPred, LHS, RHS, FoundLHS, FoundRHS, CtxI)) return true; // Otherwise assume the worst. 
return false; } bool ScalarEvolution::splitBinaryAdd(const SCEV *Expr, const SCEV *&L, const SCEV *&R, SCEV::NoWrapFlags &Flags) { const auto *AE = dyn_cast(Expr); if (!AE || AE->getNumOperands() != 2) return false; L = AE->getOperand(0); R = AE->getOperand(1); Flags = AE->getNoWrapFlags(); return true; } std::optional ScalarEvolution::computeConstantDifference(const SCEV *More, const SCEV *Less) { // We avoid subtracting expressions here because this function is usually // fairly deep in the call stack (i.e. is called many times). // X - X = 0. if (More == Less) return APInt(getTypeSizeInBits(More->getType()), 0); if (isa(Less) && isa(More)) { const auto *LAR = cast(Less); const auto *MAR = cast(More); if (LAR->getLoop() != MAR->getLoop()) return std::nullopt; // We look at affine expressions only; not for correctness but to keep // getStepRecurrence cheap. if (!LAR->isAffine() || !MAR->isAffine()) return std::nullopt; if (LAR->getStepRecurrence(*this) != MAR->getStepRecurrence(*this)) return std::nullopt; Less = LAR->getStart(); More = MAR->getStart(); // fall through } if (isa(Less) && isa(More)) { const auto &M = cast(More)->getAPInt(); const auto &L = cast(Less)->getAPInt(); return M - L; } SCEV::NoWrapFlags Flags; const SCEV *LLess = nullptr, *RLess = nullptr; const SCEV *LMore = nullptr, *RMore = nullptr; const SCEVConstant *C1 = nullptr, *C2 = nullptr; // Compare (X + C1) vs X. if (splitBinaryAdd(Less, LLess, RLess, Flags)) if ((C1 = dyn_cast(LLess))) if (RLess == More) return -(C1->getAPInt()); // Compare X vs (X + C2). if (splitBinaryAdd(More, LMore, RMore, Flags)) if ((C2 = dyn_cast(LMore))) if (RMore == Less) return C2->getAPInt(); // Compare (X + C1) vs (X + C2). if (C1 && C2 && RLess == RMore) return C2->getAPInt() - C1->getAPInt(); return std::nullopt; } bool ScalarEvolution::isImpliedCondOperandsViaAddRecStart( ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, const SCEV *FoundRHS, const Instruction *CtxI) { // Try to recognize the following pattern: // // FoundRHS = ... // ... // loop: // FoundLHS = {Start,+,W} // context_bb: // Basic block from the same loop // known(Pred, FoundLHS, FoundRHS) // // If some predicate is known in the context of a loop, it is also known on // each iteration of this loop, including the first iteration. Therefore, in // this case, `FoundLHS Pred FoundRHS` implies `Start Pred FoundRHS`. Try to // prove the original pred using this fact. if (!CtxI) return false; const BasicBlock *ContextBB = CtxI->getParent(); // Make sure AR varies in the context block. if (auto *AR = dyn_cast(FoundLHS)) { const Loop *L = AR->getLoop(); // Make sure that context belongs to the loop and executes on 1st iteration // (if it ever executes at all). if (!L->contains(ContextBB) || !DT.dominates(ContextBB, L->getLoopLatch())) return false; if (!isAvailableAtLoopEntry(FoundRHS, AR->getLoop())) return false; return isImpliedCondOperands(Pred, LHS, RHS, AR->getStart(), FoundRHS); } if (auto *AR = dyn_cast(FoundRHS)) { const Loop *L = AR->getLoop(); // Make sure that context belongs to the loop and executes on 1st iteration // (if it ever executes at all). 
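  // (A block inside the loop that dominates the latch is reached on every
  // iteration that takes the backedge, so if it executes at all it executes
  // on the first iteration as well, which is what allows the fact known there
  // to be applied to the addrec's start value.)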
if (!L->contains(ContextBB) || !DT.dominates(ContextBB, L->getLoopLatch())) return false; if (!isAvailableAtLoopEntry(FoundLHS, AR->getLoop())) return false; return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, AR->getStart()); } return false; } bool ScalarEvolution::isImpliedCondOperandsViaNoOverflow( ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, const SCEV *FoundRHS) { if (Pred != CmpInst::ICMP_SLT && Pred != CmpInst::ICMP_ULT) return false; const auto *AddRecLHS = dyn_cast(LHS); if (!AddRecLHS) return false; const auto *AddRecFoundLHS = dyn_cast(FoundLHS); if (!AddRecFoundLHS) return false; // We'd like to let SCEV reason about control dependencies, so we constrain // both the inequalities to be about add recurrences on the same loop. This // way we can use isLoopEntryGuardedByCond later. const Loop *L = AddRecFoundLHS->getLoop(); if (L != AddRecLHS->getLoop()) return false; // FoundLHS u< FoundRHS u< -C => (FoundLHS + C) u< (FoundRHS + C) ... (1) // // FoundLHS s< FoundRHS s< INT_MIN - C => (FoundLHS + C) s< (FoundRHS + C) // ... (2) // // Informal proof for (2), assuming (1) [*]: // // We'll also assume (A s< B) <=> ((A + INT_MIN) u< (B + INT_MIN)) ... (3)[**] // // Then // // FoundLHS s< FoundRHS s< INT_MIN - C // <=> (FoundLHS + INT_MIN) u< (FoundRHS + INT_MIN) u< -C [ using (3) ] // <=> (FoundLHS + INT_MIN + C) u< (FoundRHS + INT_MIN + C) [ using (1) ] // <=> (FoundLHS + INT_MIN + C + INT_MIN) s< // (FoundRHS + INT_MIN + C + INT_MIN) [ using (3) ] // <=> FoundLHS + C s< FoundRHS + C // // [*]: (1) can be proved by ruling out overflow. // // [**]: This can be proved by analyzing all the four possibilities: // (A s< 0, B s< 0), (A s< 0, B s>= 0), (A s>= 0, B s< 0) and // (A s>= 0, B s>= 0). // // Note: // Despite (2), "FoundRHS s< INT_MIN - C" does not mean that "FoundRHS + C" // will not sign underflow. For instance, say FoundLHS = (i8 -128), FoundRHS // = (i8 -127) and C = (i8 -100). Then INT_MIN - C = (i8 -28), and FoundRHS // s< (INT_MIN - C). Lack of sign overflow / underflow in "FoundRHS + C" is // neither necessary nor sufficient to prove "(FoundLHS + C) s< (FoundRHS + // C)". std::optional LDiff = computeConstantDifference(LHS, FoundLHS); std::optional RDiff = computeConstantDifference(RHS, FoundRHS); if (!LDiff || !RDiff || *LDiff != *RDiff) return false; if (LDiff->isMinValue()) return true; APInt FoundRHSLimit; if (Pred == CmpInst::ICMP_ULT) { FoundRHSLimit = -(*RDiff); } else { assert(Pred == CmpInst::ICMP_SLT && "Checked above!"); FoundRHSLimit = APInt::getSignedMinValue(getTypeSizeInBits(RHS->getType())) - *RDiff; } // Try to prove (1) or (2), as needed. return isAvailableAtLoopEntry(FoundRHS, L) && isLoopEntryGuardedByCond(L, Pred, FoundRHS, getConstant(FoundRHSLimit)); } bool ScalarEvolution::isImpliedViaMerge(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, const SCEV *FoundRHS, unsigned Depth) { const PHINode *LPhi = nullptr, *RPhi = nullptr; auto ClearOnExit = make_scope_exit([&]() { if (LPhi) { bool Erased = PendingMerges.erase(LPhi); assert(Erased && "Failed to erase LPhi!"); (void)Erased; } if (RPhi) { bool Erased = PendingMerges.erase(RPhi); assert(Erased && "Failed to erase RPhi!"); (void)Erased; } }); // Find respective Phis and check that they are not being pending. 
if (const SCEVUnknown *LU = dyn_cast(LHS)) if (auto *Phi = dyn_cast(LU->getValue())) { if (!PendingMerges.insert(Phi).second) return false; LPhi = Phi; } if (const SCEVUnknown *RU = dyn_cast(RHS)) if (auto *Phi = dyn_cast(RU->getValue())) { // If we detect a loop of Phi nodes being processed by this method, for // example: // // %a = phi i32 [ %some1, %preheader ], [ %b, %latch ] // %b = phi i32 [ %some2, %preheader ], [ %a, %latch ] // // we don't want to deal with a case that complex, so return conservative // answer false. if (!PendingMerges.insert(Phi).second) return false; RPhi = Phi; } // If none of LHS, RHS is a Phi, nothing to do here. if (!LPhi && !RPhi) return false; // If there is a SCEVUnknown Phi we are interested in, make it left. if (!LPhi) { std::swap(LHS, RHS); std::swap(FoundLHS, FoundRHS); std::swap(LPhi, RPhi); Pred = ICmpInst::getSwappedPredicate(Pred); } assert(LPhi && "LPhi should definitely be a SCEVUnknown Phi!"); const BasicBlock *LBB = LPhi->getParent(); const SCEVAddRecExpr *RAR = dyn_cast(RHS); auto ProvedEasily = [&](const SCEV *S1, const SCEV *S2) { return isKnownViaNonRecursiveReasoning(Pred, S1, S2) || isImpliedCondOperandsViaRanges(Pred, S1, S2, FoundLHS, FoundRHS) || isImpliedViaOperations(Pred, S1, S2, FoundLHS, FoundRHS, Depth); }; if (RPhi && RPhi->getParent() == LBB) { // Case one: RHS is also a SCEVUnknown Phi from the same basic block. // If we compare two Phis from the same block, and for each entry block // the predicate is true for incoming values from this block, then the // predicate is also true for the Phis. for (const BasicBlock *IncBB : predecessors(LBB)) { const SCEV *L = getSCEV(LPhi->getIncomingValueForBlock(IncBB)); const SCEV *R = getSCEV(RPhi->getIncomingValueForBlock(IncBB)); if (!ProvedEasily(L, R)) return false; } } else if (RAR && RAR->getLoop()->getHeader() == LBB) { // Case two: RHS is also a Phi from the same basic block, and it is an // AddRec. It means that there is a loop which has both AddRec and Unknown // PHIs, for it we can compare incoming values of AddRec from above the loop // and latch with their respective incoming values of LPhi. // TODO: Generalize to handle loops with many inputs in a header. if (LPhi->getNumIncomingValues() != 2) return false; auto *RLoop = RAR->getLoop(); auto *Predecessor = RLoop->getLoopPredecessor(); assert(Predecessor && "Loop with AddRec with no predecessor?"); const SCEV *L1 = getSCEV(LPhi->getIncomingValueForBlock(Predecessor)); if (!ProvedEasily(L1, RAR->getStart())) return false; auto *Latch = RLoop->getLoopLatch(); assert(Latch && "Loop with AddRec with no latch?"); const SCEV *L2 = getSCEV(LPhi->getIncomingValueForBlock(Latch)); if (!ProvedEasily(L2, RAR->getPostIncExpr(*this))) return false; } else { // In all other cases go over inputs of LHS and compare each of them to RHS, // the predicate is true for (LHS, RHS) if it is true for all such pairs. // At this point RHS is either a non-Phi, or it is a Phi from some block // different from LBB. for (const BasicBlock *IncBB : predecessors(LBB)) { // Check that RHS is available in this block. if (!dominates(RHS, IncBB)) return false; const SCEV *L = getSCEV(LPhi->getIncomingValueForBlock(IncBB)); // Make sure L does not refer to a value from a potentially previous // iteration of a loop. 
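      // (That is, the SCEV of the incoming value must be fixed above the
      // Phi's block; otherwise it could describe the value produced by an
      // earlier iteration of some loop rather than the value actually flowing
      // into the Phi along this edge.)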
if (!properlyDominates(L, LBB)) return false; if (!ProvedEasily(L, RHS)) return false; } } return true; } bool ScalarEvolution::isImpliedCondOperandsViaShift(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, const SCEV *FoundRHS) { // We want to imply LHS < RHS from LHS < (RHS >> shiftvalue). First, make // sure that we are dealing with same LHS. if (RHS == FoundRHS) { std::swap(LHS, RHS); std::swap(FoundLHS, FoundRHS); Pred = ICmpInst::getSwappedPredicate(Pred); } if (LHS != FoundLHS) return false; auto *SUFoundRHS = dyn_cast(FoundRHS); if (!SUFoundRHS) return false; Value *Shiftee, *ShiftValue; using namespace PatternMatch; if (match(SUFoundRHS->getValue(), m_LShr(m_Value(Shiftee), m_Value(ShiftValue)))) { auto *ShifteeS = getSCEV(Shiftee); // Prove one of the following: // LHS > shiftvalue) && shiftee <=u RHS ---> LHS > shiftvalue) && shiftee <=u RHS ---> LHS <=u RHS // LHS > shiftvalue) && shiftee <=s RHS && shiftee >=s 0 // ---> LHS > shiftvalue) && shiftee <=s RHS && shiftee >=s 0 // ---> LHS <=s RHS if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) return isKnownPredicate(ICmpInst::ICMP_ULE, ShifteeS, RHS); if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) if (isKnownNonNegative(ShifteeS)) return isKnownPredicate(ICmpInst::ICMP_SLE, ShifteeS, RHS); } return false; } bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, const SCEV *FoundRHS, const Instruction *CtxI) { if (isImpliedCondOperandsViaRanges(Pred, LHS, RHS, FoundLHS, FoundRHS)) return true; if (isImpliedCondOperandsViaNoOverflow(Pred, LHS, RHS, FoundLHS, FoundRHS)) return true; if (isImpliedCondOperandsViaShift(Pred, LHS, RHS, FoundLHS, FoundRHS)) return true; if (isImpliedCondOperandsViaAddRecStart(Pred, LHS, RHS, FoundLHS, FoundRHS, CtxI)) return true; return isImpliedCondOperandsHelper(Pred, LHS, RHS, FoundLHS, FoundRHS); } /// Is MaybeMinMaxExpr an (U|S)(Min|Max) of Candidate and some other values? template static bool IsMinMaxConsistingOf(const SCEV *MaybeMinMaxExpr, const SCEV *Candidate) { const MinMaxExprType *MinMaxExpr = dyn_cast(MaybeMinMaxExpr); if (!MinMaxExpr) return false; return is_contained(MinMaxExpr->operands(), Candidate); } static bool IsKnownPredicateViaAddRecStart(ScalarEvolution &SE, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { // If both sides are affine addrecs for the same loop, with equal // steps, and we know the recurrences don't wrap, then we only // need to check the predicate on the starting values. if (!ICmpInst::isRelational(Pred)) return false; const SCEVAddRecExpr *LAR = dyn_cast(LHS); if (!LAR) return false; const SCEVAddRecExpr *RAR = dyn_cast(RHS); if (!RAR) return false; if (LAR->getLoop() != RAR->getLoop()) return false; if (!LAR->isAffine() || !RAR->isAffine()) return false; if (LAR->getStepRecurrence(SE) != RAR->getStepRecurrence(SE)) return false; SCEV::NoWrapFlags NW = ICmpInst::isSigned(Pred) ? SCEV::FlagNSW : SCEV::FlagNUW; if (!LAR->getNoWrapFlags(NW) || !RAR->getNoWrapFlags(NW)) return false; return SE.isKnownPredicate(Pred, LAR->getStart(), RAR->getStart()); } /// Is LHS `Pred` RHS true on the virtue of LHS or RHS being a Min or Max /// expression? static bool IsKnownPredicateViaMinOrMax(ScalarEvolution &SE, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { switch (Pred) { default: return false; case ICmpInst::ICMP_SGE: std::swap(LHS, RHS); [[fallthrough]]; case ICmpInst::ICMP_SLE: return // min(A, ...) 
<= A IsMinMaxConsistingOf(LHS, RHS) || // A <= max(A, ...) IsMinMaxConsistingOf(RHS, LHS); case ICmpInst::ICMP_UGE: std::swap(LHS, RHS); [[fallthrough]]; case ICmpInst::ICMP_ULE: return // min(A, ...) <= A // FIXME: what about umin_seq? IsMinMaxConsistingOf(LHS, RHS) || // A <= max(A, ...) IsMinMaxConsistingOf(RHS, LHS); } llvm_unreachable("covered switch fell through?!"); } bool ScalarEvolution::isImpliedViaOperations(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, const SCEV *FoundRHS, unsigned Depth) { assert(getTypeSizeInBits(LHS->getType()) == getTypeSizeInBits(RHS->getType()) && "LHS and RHS have different sizes?"); assert(getTypeSizeInBits(FoundLHS->getType()) == getTypeSizeInBits(FoundRHS->getType()) && "FoundLHS and FoundRHS have different sizes?"); // We want to avoid hurting the compile time with analysis of too big trees. if (Depth > MaxSCEVOperationsImplicationDepth) return false; // We only want to work with GT comparison so far. if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_SLT) { Pred = CmpInst::getSwappedPredicate(Pred); std::swap(LHS, RHS); std::swap(FoundLHS, FoundRHS); } // For unsigned, try to reduce it to corresponding signed comparison. if (Pred == ICmpInst::ICMP_UGT) // We can replace unsigned predicate with its signed counterpart if all // involved values are non-negative. // TODO: We could have better support for unsigned. if (isKnownNonNegative(FoundLHS) && isKnownNonNegative(FoundRHS)) { // Knowing that both FoundLHS and FoundRHS are non-negative, and knowing // FoundLHS >u FoundRHS, we also know that FoundLHS >s FoundRHS. Let us // use this fact to prove that LHS and RHS are non-negative. const SCEV *MinusOne = getMinusOne(LHS->getType()); if (isImpliedCondOperands(ICmpInst::ICMP_SGT, LHS, MinusOne, FoundLHS, FoundRHS) && isImpliedCondOperands(ICmpInst::ICMP_SGT, RHS, MinusOne, FoundLHS, FoundRHS)) Pred = ICmpInst::ICMP_SGT; } if (Pred != ICmpInst::ICMP_SGT) return false; auto GetOpFromSExt = [&](const SCEV *S) { if (auto *Ext = dyn_cast(S)) return Ext->getOperand(); // TODO: If S is a SCEVConstant then you can cheaply "strip" the sext off // the constant in some cases. return S; }; // Acquire values from extensions. auto *OrigLHS = LHS; auto *OrigFoundLHS = FoundLHS; LHS = GetOpFromSExt(LHS); FoundLHS = GetOpFromSExt(FoundLHS); // Is the SGT predicate can be proved trivially or using the found context. auto IsSGTViaContext = [&](const SCEV *S1, const SCEV *S2) { return isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_SGT, S1, S2) || isImpliedViaOperations(ICmpInst::ICMP_SGT, S1, S2, OrigFoundLHS, FoundRHS, Depth + 1); }; if (auto *LHSAddExpr = dyn_cast(LHS)) { // We want to avoid creation of any new non-constant SCEV. Since we are // going to compare the operands to RHS, we should be certain that we don't // need any size extensions for this. So let's decline all cases when the // sizes of types of LHS and RHS do not match. // TODO: Maybe try to get RHS from sext to catch more cases? if (getTypeSizeInBits(LHS->getType()) != getTypeSizeInBits(RHS->getType())) return false; // Should not overflow. if (!LHSAddExpr->hasNoSignedWrap()) return false; auto *LL = LHSAddExpr->getOperand(0); auto *LR = LHSAddExpr->getOperand(1); auto *MinusOne = getMinusOne(RHS->getType()); // Checks that S1 >= 0 && S2 > RHS, trivially or using the found context. 
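    // Illustrative numbers, not from the original source: with signed values
    // LL = 1, LR = 9 and RHS = 5, proving LL >= 0 and LR > RHS lets us
    // conclude LHS = LL + LR = 10 > 5 = RHS, because the nsw addition checked
    // above guarantees the sum is at least LR.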
auto IsSumGreaterThanRHS = [&](const SCEV *S1, const SCEV *S2) { return IsSGTViaContext(S1, MinusOne) && IsSGTViaContext(S2, RHS); }; // Try to prove the following rule: // (LHS = LL + LR) && (LL >= 0) && (LR > RHS) => (LHS > RHS). // (LHS = LL + LR) && (LR >= 0) && (LL > RHS) => (LHS > RHS). if (IsSumGreaterThanRHS(LL, LR) || IsSumGreaterThanRHS(LR, LL)) return true; } else if (auto *LHSUnknownExpr = dyn_cast(LHS)) { Value *LL, *LR; // FIXME: Once we have SDiv implemented, we can get rid of this matching. using namespace llvm::PatternMatch; if (match(LHSUnknownExpr->getValue(), m_SDiv(m_Value(LL), m_Value(LR)))) { // Rules for division. // We are going to perform some comparisons with Denominator and its // derivative expressions. In general case, creating a SCEV for it may // lead to a complex analysis of the entire graph, and in particular it // can request trip count recalculation for the same loop. This would // cache as SCEVCouldNotCompute to avoid the infinite recursion. To avoid // this, we only want to create SCEVs that are constants in this section. // So we bail if Denominator is not a constant. if (!isa(LR)) return false; auto *Denominator = cast(getSCEV(LR)); // We want to make sure that LHS = FoundLHS / Denominator. If it is so, // then a SCEV for the numerator already exists and matches with FoundLHS. auto *Numerator = getExistingSCEV(LL); if (!Numerator || Numerator->getType() != FoundLHS->getType()) return false; // Make sure that the numerator matches with FoundLHS and the denominator // is positive. if (!HasSameValue(Numerator, FoundLHS) || !isKnownPositive(Denominator)) return false; auto *DTy = Denominator->getType(); auto *FRHSTy = FoundRHS->getType(); if (DTy->isPointerTy() != FRHSTy->isPointerTy()) // One of types is a pointer and another one is not. We cannot extend // them properly to a wider type, so let us just reject this case. // TODO: Usage of getEffectiveSCEVType for DTy, FRHSTy etc should help // to avoid this check. return false; // Given that: // FoundLHS > FoundRHS, LHS = FoundLHS / Denominator, Denominator > 0. auto *WTy = getWiderType(DTy, FRHSTy); auto *DenominatorExt = getNoopOrSignExtend(Denominator, WTy); auto *FoundRHSExt = getNoopOrSignExtend(FoundRHS, WTy); // Try to prove the following rule: // (FoundRHS > Denominator - 2) && (RHS <= 0) => (LHS > RHS). // For example, given that FoundLHS > 2. It means that FoundLHS is at // least 3. If we divide it by Denominator < 4, we will have at least 1. auto *DenomMinusTwo = getMinusSCEV(DenominatorExt, getConstant(WTy, 2)); if (isKnownNonPositive(RHS) && IsSGTViaContext(FoundRHSExt, DenomMinusTwo)) return true; // Try to prove the following rule: // (FoundRHS > -1 - Denominator) && (RHS < 0) => (LHS > RHS). // For example, given that FoundLHS > -3. Then FoundLHS is at least -2. // If we divide it by Denominator > 2, then: // 1. If FoundLHS is negative, then the result is 0. // 2. If FoundLHS is non-negative, then the result is non-negative. // Anyways, the result is non-negative. auto *MinusOne = getMinusOne(WTy); auto *NegDenomMinusOne = getMinusSCEV(MinusOne, DenominatorExt); if (isKnownNegative(RHS) && IsSGTViaContext(FoundRHSExt, NegDenomMinusOne)) return true; } } // If our expression contained SCEVUnknown Phis, and we split it down and now // need to prove something for them, try to prove the predicate for every // possible incoming values of those Phis. 
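  // Illustrative sketch, not from the original source: if OrigLHS is
  //   %p = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
  // then proving the predicate separately for (%a, RHS) along %bb1 and for
  // (%b, RHS) along %bb2 is enough to prove it for %p itself, which is what
  // isImpliedViaMerge attempts below.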
if (isImpliedViaMerge(Pred, OrigLHS, RHS, OrigFoundLHS, FoundRHS, Depth + 1)) return true; return false; } static bool isKnownPredicateExtendIdiom(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { // zext x u<= sext x, sext x s<= zext x switch (Pred) { case ICmpInst::ICMP_SGE: std::swap(LHS, RHS); [[fallthrough]]; case ICmpInst::ICMP_SLE: { // If operand >=s 0 then ZExt == SExt. If operand (LHS); const SCEVZeroExtendExpr *ZExt = dyn_cast(RHS); if (SExt && ZExt && SExt->getOperand() == ZExt->getOperand()) return true; break; } case ICmpInst::ICMP_UGE: std::swap(LHS, RHS); [[fallthrough]]; case ICmpInst::ICMP_ULE: { // If operand >=s 0 then ZExt == SExt. If operand (LHS); const SCEVSignExtendExpr *SExt = dyn_cast(RHS); if (SExt && ZExt && SExt->getOperand() == ZExt->getOperand()) return true; break; } default: break; }; return false; } bool ScalarEvolution::isKnownViaNonRecursiveReasoning(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { return isKnownPredicateExtendIdiom(Pred, LHS, RHS) || isKnownPredicateViaConstantRanges(Pred, LHS, RHS) || IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS) || IsKnownPredicateViaAddRecStart(*this, Pred, LHS, RHS) || isKnownPredicateViaNoOverflow(Pred, LHS, RHS); } bool ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, const SCEV *FoundRHS) { switch (Pred) { default: llvm_unreachable("Unexpected ICmpInst::Predicate value!"); case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_NE: if (HasSameValue(LHS, FoundLHS) && HasSameValue(RHS, FoundRHS)) return true; break; case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: if (isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_SLE, LHS, FoundLHS) && isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_SGE, RHS, FoundRHS)) return true; break; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: if (isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_SGE, LHS, FoundLHS) && isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_SLE, RHS, FoundRHS)) return true; break; case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: if (isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_ULE, LHS, FoundLHS) && isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_UGE, RHS, FoundRHS)) return true; break; case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: if (isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_UGE, LHS, FoundLHS) && isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_ULE, RHS, FoundRHS)) return true; break; } // Maybe it can be proved via operations? if (isImpliedViaOperations(Pred, LHS, RHS, FoundLHS, FoundRHS)) return true; return false; } bool ScalarEvolution::isImpliedCondOperandsViaRanges(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, const SCEV *FoundRHS) { if (!isa(RHS) || !isa(FoundRHS)) // The restriction on `FoundRHS` be lifted easily -- it exists only to // reduce the compile time impact of this optimization. return false; std::optional Addend = computeConstantDifference(LHS, FoundLHS); if (!Addend) return false; const APInt &ConstFoundRHS = cast(FoundRHS)->getAPInt(); // `FoundLHSRange` is the range we know `FoundLHS` to be in by virtue of the // antecedent "`FoundLHS` `Pred` `FoundRHS`". 
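  // Illustrative numbers, not from the original source: for the antecedent
  // "FoundLHS u< 10" we get FoundLHSRange = [0, 10). If LHS = FoundLHS + 2
  // (Addend = 2), then LHSRange = [2, 12), and a consequent such as
  // "LHS u< 12" holds for every value in that range, so the implication is
  // established below via LHSRange.icmp(Pred, ConstRHS).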
ConstantRange FoundLHSRange = ConstantRange::makeExactICmpRegion(Pred, ConstFoundRHS); // Since `LHS` is `FoundLHS` + `Addend`, we can compute a range for `LHS`: ConstantRange LHSRange = FoundLHSRange.add(ConstantRange(*Addend)); // We can also compute the range of values for `LHS` that satisfy the // consequent, "`LHS` `Pred` `RHS`": const APInt &ConstRHS = cast(RHS)->getAPInt(); // The antecedent implies the consequent if every value of `LHS` that // satisfies the antecedent also satisfies the consequent. return LHSRange.icmp(Pred, ConstRHS); } bool ScalarEvolution::canIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride, bool IsSigned) { assert(isKnownPositive(Stride) && "Positive stride expected!"); unsigned BitWidth = getTypeSizeInBits(RHS->getType()); const SCEV *One = getOne(Stride->getType()); if (IsSigned) { APInt MaxRHS = getSignedRangeMax(RHS); APInt MaxValue = APInt::getSignedMaxValue(BitWidth); APInt MaxStrideMinusOne = getSignedRangeMax(getMinusSCEV(Stride, One)); // SMaxRHS + SMaxStrideMinusOne > SMaxValue => overflow! return (std::move(MaxValue) - MaxStrideMinusOne).slt(MaxRHS); } APInt MaxRHS = getUnsignedRangeMax(RHS); APInt MaxValue = APInt::getMaxValue(BitWidth); APInt MaxStrideMinusOne = getUnsignedRangeMax(getMinusSCEV(Stride, One)); // UMaxRHS + UMaxStrideMinusOne > UMaxValue => overflow! return (std::move(MaxValue) - MaxStrideMinusOne).ult(MaxRHS); } bool ScalarEvolution::canIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride, bool IsSigned) { unsigned BitWidth = getTypeSizeInBits(RHS->getType()); const SCEV *One = getOne(Stride->getType()); if (IsSigned) { APInt MinRHS = getSignedRangeMin(RHS); APInt MinValue = APInt::getSignedMinValue(BitWidth); APInt MaxStrideMinusOne = getSignedRangeMax(getMinusSCEV(Stride, One)); // SMinRHS - SMaxStrideMinusOne < SMinValue => overflow! return (std::move(MinValue) + MaxStrideMinusOne).sgt(MinRHS); } APInt MinRHS = getUnsignedRangeMin(RHS); APInt MinValue = APInt::getMinValue(BitWidth); APInt MaxStrideMinusOne = getUnsignedRangeMax(getMinusSCEV(Stride, One)); // UMinRHS - UMaxStrideMinusOne < UMinValue => overflow! return (std::move(MinValue) + MaxStrideMinusOne).ugt(MinRHS); } const SCEV *ScalarEvolution::getUDivCeilSCEV(const SCEV *N, const SCEV *D) { // umin(N, 1) + floor((N - umin(N, 1)) / D) // This is equivalent to "1 + floor((N - 1) / D)" for N != 0. The umin // expression fixes the case of N=0. const SCEV *MinNOne = getUMinExpr(N, getOne(N->getType())); const SCEV *NMinusOne = getMinusSCEV(N, MinNOne); return getAddExpr(MinNOne, getUDivExpr(NMinusOne, D)); } const SCEV *ScalarEvolution::computeMaxBECountForLT(const SCEV *Start, const SCEV *Stride, const SCEV *End, unsigned BitWidth, bool IsSigned) { // The logic in this function assumes we can represent a positive stride. // If we can't, the backedge-taken count must be zero. if (IsSigned && BitWidth == 1) return getZero(Stride->getType()); // This code below only been closely audited for negative strides in the // unsigned comparison case, it may be correct for signed comparison, but // that needs to be established. if (IsSigned && isKnownNegative(Stride)) return getCouldNotCompute(); // Calculate the maximum backedge count based on the range of values // permitted by Start, End, and Stride. APInt MinStart = IsSigned ? getSignedRangeMin(Start) : getUnsignedRangeMin(Start); APInt MinStride = IsSigned ? getSignedRangeMin(Stride) : getUnsignedRangeMin(Stride); // We assume either the stride is positive, or the backedge-taken count // is zero. 
So force StrideForMaxBECount to be at least one. APInt One(BitWidth, 1); APInt StrideForMaxBECount = IsSigned ? APIntOps::smax(One, MinStride) : APIntOps::umax(One, MinStride); APInt MaxValue = IsSigned ? APInt::getSignedMaxValue(BitWidth) : APInt::getMaxValue(BitWidth); APInt Limit = MaxValue - (StrideForMaxBECount - 1); // Although End can be a MAX expression we estimate MaxEnd considering only // the case End = RHS of the loop termination condition. This is safe because // in the other case (End - Start) is zero, leading to a zero maximum backedge // taken count. APInt MaxEnd = IsSigned ? APIntOps::smin(getSignedRangeMax(End), Limit) : APIntOps::umin(getUnsignedRangeMax(End), Limit); // MaxBECount = ceil((max(MaxEnd, MinStart) - MinStart) / Stride) MaxEnd = IsSigned ? APIntOps::smax(MaxEnd, MinStart) : APIntOps::umax(MaxEnd, MinStart); return getUDivCeilSCEV(getConstant(MaxEnd - MinStart) /* Delta */, getConstant(StrideForMaxBECount) /* Step */); } ScalarEvolution::ExitLimit ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS, const Loop *L, bool IsSigned, bool ControlsOnlyExit, bool AllowPredicates) { SmallPtrSet Predicates; const SCEVAddRecExpr *IV = dyn_cast(LHS); bool PredicatedIV = false; auto canAssumeNoSelfWrap = [&](const SCEVAddRecExpr *AR) { // Can we prove this loop *must* be UB if overflow of IV occurs? // Reasoning goes as follows: // * Suppose the IV did self wrap. // * If Stride evenly divides the iteration space, then once wrap // occurs, the loop must revisit the same values. // * We know that RHS is invariant, and that none of those values // caused this exit to be taken previously. Thus, this exit is // dynamically dead. // * If this is the sole exit, then a dead exit implies the loop // must be infinite if there are no abnormal exits. // * If the loop were infinite, then it must either not be mustprogress // or have side effects. Otherwise, it must be UB. // * It can't (by assumption), be UB so we have contradicted our // premise and can conclude the IV did not in fact self-wrap. if (!isLoopInvariant(RHS, L)) return false; auto *StrideC = dyn_cast(AR->getStepRecurrence(*this)); if (!StrideC || !StrideC->getAPInt().isPowerOf2()) return false; if (!ControlsOnlyExit || !loopHasNoAbnormalExits(L)) return false; return loopIsFiniteByAssumption(L); }; if (!IV) { if (auto *ZExt = dyn_cast(LHS)) { const SCEVAddRecExpr *AR = dyn_cast(ZExt->getOperand()); if (AR && AR->getLoop() == L && AR->isAffine()) { auto canProveNUW = [&]() { if (!isLoopInvariant(RHS, L)) return false; if (!isKnownNonZero(AR->getStepRecurrence(*this))) // We need the sequence defined by AR to strictly increase in the // unsigned integer domain for the logic below to hold. return false; const unsigned InnerBitWidth = getTypeSizeInBits(AR->getType()); const unsigned OuterBitWidth = getTypeSizeInBits(RHS->getType()); // If RHS <=u Limit, then there must exist a value V in the sequence // defined by AR (e.g. {Start,+,Step}) such that V >u RHS, and // V <=u UINT_MAX. Thus, we must exit the loop before unsigned // overflow occurs. This limit also implies that a signed comparison // (in the wide bitwidth) is equivalent to an unsigned comparison as // the high bits on both sides must be zero. 
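            // Illustrative numbers, not from the original source: with an
            // 8-bit AddRec whose step is known to lie in [1, 4], StrideMax = 4
            // and Limit = 255 - 3 = 252. If RHS (in the wide type) is known to
            // be u<= 252, the last in-loop value is u< RHS <= 252, so the
            // first value exceeding RHS is at most 251 + 4 = 255 and the 8-bit
            // IV cannot wrap before the exit is taken.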
APInt StrideMax = getUnsignedRangeMax(AR->getStepRecurrence(*this)); APInt Limit = APInt::getMaxValue(InnerBitWidth) - (StrideMax - 1); Limit = Limit.zext(OuterBitWidth); return getUnsignedRangeMax(applyLoopGuards(RHS, L)).ule(Limit); }; auto Flags = AR->getNoWrapFlags(); if (!hasFlags(Flags, SCEV::FlagNUW) && canProveNUW()) Flags = setFlags(Flags, SCEV::FlagNUW); setNoWrapFlags(const_cast(AR), Flags); if (AR->hasNoUnsignedWrap()) { // Emulate what getZeroExtendExpr would have done during construction // if we'd been able to infer the fact just above at that time. const SCEV *Step = AR->getStepRecurrence(*this); Type *Ty = ZExt->getType(); auto *S = getAddRecExpr( getExtendAddRecStart(AR, Ty, this, 0), getZeroExtendExpr(Step, Ty, 0), L, AR->getNoWrapFlags()); IV = dyn_cast(S); } } } } if (!IV && AllowPredicates) { // Try to make this an AddRec using runtime tests, in the first X // iterations of this loop, where X is the SCEV expression found by the // algorithm below. IV = convertSCEVToAddRecWithPredicates(LHS, L, Predicates); PredicatedIV = true; } // Avoid weird loops if (!IV || IV->getLoop() != L || !IV->isAffine()) return getCouldNotCompute(); // A precondition of this method is that the condition being analyzed // reaches an exiting branch which dominates the latch. Given that, we can // assume that an increment which violates the nowrap specification and // produces poison must cause undefined behavior when the resulting poison // value is branched upon and thus we can conclude that the backedge is // taken no more often than would be required to produce that poison value. // Note that a well defined loop can exit on the iteration which violates // the nowrap specification if there is another exit (either explicit or // implicit/exceptional) which causes the loop to execute before the // exiting instruction we're analyzing would trigger UB. auto WrapType = IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW; bool NoWrap = ControlsOnlyExit && IV->getNoWrapFlags(WrapType); ICmpInst::Predicate Cond = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; const SCEV *Stride = IV->getStepRecurrence(*this); bool PositiveStride = isKnownPositive(Stride); // Avoid negative or zero stride values. if (!PositiveStride) { // We can compute the correct backedge taken count for loops with unknown // strides if we can prove that the loop is not an infinite loop with side // effects. Here's the loop structure we are trying to handle - // // i = start // do { // A[i] = i; // i += s; // } while (i < end); // // The backedge taken count for such loops is evaluated as - // (max(end, start + stride) - start - 1) /u stride // // The additional preconditions that we need to check to prove correctness // of the above formula is as follows - // // a) IV is either nuw or nsw depending upon signedness (indicated by the // NoWrap flag). // b) the loop is guaranteed to be finite (e.g. is mustprogress and has // no side effects within the loop) // c) loop has a single static exit (with no abnormal exits) // // Precondition a) implies that if the stride is negative, this is a single // trip loop. The backedge taken count formula reduces to zero in this case. // // Precondition b) and c) combine to imply that if rhs is invariant in L, // then a zero stride means the backedge can't be taken without executing // undefined behavior. // // The positive stride case is the same as isKnownPositive(Stride) returning // true (original behavior of the function). 
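    // Illustrative numbers, not from the original source: for the do-while
    // shape above with start = 0, stride = 3 and end = 10, the IV is tested
    // at 3, 6, 9 and finally 12, so the backedge is taken 3 times, matching
    // (max(10, 0 + 3) - 0 - 1) /u 3 = 9 /u 3 = 3.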
// if (PredicatedIV || !NoWrap || !loopIsFiniteByAssumption(L) || !loopHasNoAbnormalExits(L)) return getCouldNotCompute(); if (!isKnownNonZero(Stride)) { // If we have a step of zero, and RHS isn't invariant in L, we don't know // if it might eventually be greater than start and if so, on which // iteration. We can't even produce a useful upper bound. if (!isLoopInvariant(RHS, L)) return getCouldNotCompute(); // We allow a potentially zero stride, but we need to divide by stride // below. Since the loop can't be infinite and this check must control // the sole exit, we can infer the exit must be taken on the first // iteration (e.g. backedge count = 0) if the stride is zero. Given that, // we know the numerator in the divides below must be zero, so we can // pick an arbitrary non-zero value for the denominator (e.g. stride) // and produce the right result. // FIXME: Handle the case where Stride is poison? auto wouldZeroStrideBeUB = [&]() { // Proof by contradiction. Suppose the stride were zero. If we can // prove that the backedge *is* taken on the first iteration, then since // we know this condition controls the sole exit, we must have an // infinite loop. We can't have a (well defined) infinite loop per // check just above. // Note: The (Start - Stride) term is used to get the start' term from // (start' + stride,+,stride). Remember that we only care about the // result of this expression when stride == 0 at runtime. auto *StartIfZero = getMinusSCEV(IV->getStart(), Stride); return isLoopEntryGuardedByCond(L, Cond, StartIfZero, RHS); }; if (!wouldZeroStrideBeUB()) { Stride = getUMaxExpr(Stride, getOne(Stride->getType())); } } } else if (!Stride->isOne() && !NoWrap) { auto isUBOnWrap = [&]() { // From no-self-wrap, we need to then prove no-(un)signed-wrap. This // follows trivially from the fact that every (un)signed-wrapped, but // not self-wrapped value must be LT than the last value before // (un)signed wrap. Since we know that last value didn't exit, nor // will any smaller one. return canAssumeNoSelfWrap(IV); }; // Avoid proven overflow cases: this will ensure that the backedge taken // count will not generate any unsigned overflow. Relaxed no-overflow // conditions exploit NoWrapFlags, allowing to optimize in presence of // undefined behaviors like the case of C language. if (canIVOverflowOnLT(RHS, Stride, IsSigned) && !isUBOnWrap()) return getCouldNotCompute(); } // On all paths just preceeding, we established the following invariant: // IV can be assumed not to overflow up to and including the exiting // iteration. We proved this in one of two ways: // 1) We can show overflow doesn't occur before the exiting iteration // 1a) canIVOverflowOnLT, and b) step of one // 2) We can show that if overflow occurs, the loop must execute UB // before any possible exit. // Note that we have not yet proved RHS invariant (in general). const SCEV *Start = IV->getStart(); // Preserve pointer-typed Start/RHS to pass to isLoopEntryGuardedByCond. // If we convert to integers, isLoopEntryGuardedByCond will miss some cases. // Use integer-typed versions for actual computation; we can't subtract // pointers in general. 
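  // (Illustrative note, not from the original source: if Start and RHS point
  // into different underlying objects, there is no meaningful integer
  // difference SCEV could form between them directly, which is why the
  // arithmetic below works on the ptrtoint'd copies while the original
  // pointer-typed values are kept for the entry-guard queries.)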
const SCEV *OrigStart = Start; const SCEV *OrigRHS = RHS; if (Start->getType()->isPointerTy()) { Start = getLosslessPtrToIntExpr(Start); if (isa(Start)) return Start; } if (RHS->getType()->isPointerTy()) { RHS = getLosslessPtrToIntExpr(RHS); if (isa(RHS)) return RHS; } // When the RHS is not invariant, we do not know the end bound of the loop and // cannot calculate the ExactBECount needed by ExitLimit. However, we can // calculate the MaxBECount, given the start, stride and max value for the end // bound of the loop (RHS), and the fact that IV does not overflow (which is // checked above). if (!isLoopInvariant(RHS, L)) { const SCEV *MaxBECount = computeMaxBECountForLT( Start, Stride, RHS, getTypeSizeInBits(LHS->getType()), IsSigned); return ExitLimit(getCouldNotCompute() /* ExactNotTaken */, MaxBECount, MaxBECount, false /*MaxOrZero*/, Predicates); } // We use the expression (max(End,Start)-Start)/Stride to describe the // backedge count, as if the backedge is taken at least once max(End,Start) // is End and so the result is as above, and if not max(End,Start) is Start // so we get a backedge count of zero. const SCEV *BECount = nullptr; auto *OrigStartMinusStride = getMinusSCEV(OrigStart, Stride); assert(isAvailableAtLoopEntry(OrigStartMinusStride, L) && "Must be!"); assert(isAvailableAtLoopEntry(OrigStart, L) && "Must be!"); assert(isAvailableAtLoopEntry(OrigRHS, L) && "Must be!"); // Can we prove (max(RHS,Start) > Start - Stride? if (isLoopEntryGuardedByCond(L, Cond, OrigStartMinusStride, OrigStart) && isLoopEntryGuardedByCond(L, Cond, OrigStartMinusStride, OrigRHS)) { // In this case, we can use a refined formula for computing backedge taken // count. The general formula remains: // "End-Start /uceiling Stride" where "End = max(RHS,Start)" // We want to use the alternate formula: // "((End - 1) - (Start - Stride)) /u Stride" // Let's do a quick case analysis to show these are equivalent under // our precondition that max(RHS,Start) > Start - Stride. // * For RHS <= Start, the backedge-taken count must be zero. // "((End - 1) - (Start - Stride)) /u Stride" reduces to // "((Start - 1) - (Start - Stride)) /u Stride" which simplies to // "Stride - 1 /u Stride" which is indeed zero for all non-zero values // of Stride. For 0 stride, we've use umin(1,Stride) above, reducing // this to the stride of 1 case. // * For RHS >= Start, the backedge count must be "RHS-Start /uceil Stride". // "((End - 1) - (Start - Stride)) /u Stride" reduces to // "((RHS - 1) - (Start - Stride)) /u Stride" reassociates to // "((RHS - (Start - Stride) - 1) /u Stride". // Our preconditions trivially imply no overflow in that form. const SCEV *MinusOne = getMinusOne(Stride->getType()); const SCEV *Numerator = getMinusSCEV(getAddExpr(RHS, MinusOne), getMinusSCEV(Start, Stride)); BECount = getUDivExpr(Numerator, Stride); } const SCEV *BECountIfBackedgeTaken = nullptr; if (!BECount) { auto canProveRHSGreaterThanEqualStart = [&]() { auto CondGE = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; if (isLoopEntryGuardedByCond(L, CondGE, OrigRHS, OrigStart)) return true; // (RHS > Start - 1) implies RHS >= Start. // * "RHS >= Start" is trivially equivalent to "RHS > Start - 1" if // "Start - 1" doesn't overflow. // * For signed comparison, if Start - 1 does overflow, it's equal // to INT_MAX, and "RHS >s INT_MAX" is trivially false. // * For unsigned comparison, if Start - 1 does overflow, it's equal // to UINT_MAX, and "RHS >u UINT_MAX" is trivially false. // // FIXME: Should isLoopEntryGuardedByCond do this for us? 
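      // Illustrative numbers, not from the original source: unsigned, with
      // Start = 5, "RHS >u 4" is exactly "RHS >=u 5". If Start = 0, then
      // Start - 1 wraps to UINT_MAX and "RHS >u UINT_MAX" is never provable,
      // so the rewrite below can only ever be conservative.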
auto CondGT = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; auto *StartMinusOne = getAddExpr(OrigStart, getMinusOne(OrigStart->getType())); return isLoopEntryGuardedByCond(L, CondGT, OrigRHS, StartMinusOne); }; // If we know that RHS >= Start in the context of loop, then we know that // max(RHS, Start) = RHS at this point. const SCEV *End; if (canProveRHSGreaterThanEqualStart()) { End = RHS; } else { // If RHS < Start, the backedge will be taken zero times. So in // general, we can write the backedge-taken count as: // // RHS >= Start ? ceil(RHS - Start) / Stride : 0 // // We convert it to the following to make it more convenient for SCEV: // // ceil(max(RHS, Start) - Start) / Stride End = IsSigned ? getSMaxExpr(RHS, Start) : getUMaxExpr(RHS, Start); // See what would happen if we assume the backedge is taken. This is // used to compute MaxBECount. BECountIfBackedgeTaken = getUDivCeilSCEV(getMinusSCEV(RHS, Start), Stride); } // At this point, we know: // // 1. If IsSigned, Start <=s End; otherwise, Start <=u End // 2. The index variable doesn't overflow. // // Therefore, we know N exists such that // (Start + Stride * N) >= End, and computing "(Start + Stride * N)" // doesn't overflow. // // Using this information, try to prove whether the addition in // "(Start - End) + (Stride - 1)" has unsigned overflow. const SCEV *One = getOne(Stride->getType()); bool MayAddOverflow = [&] { if (auto *StrideC = dyn_cast(Stride)) { if (StrideC->getAPInt().isPowerOf2()) { // Suppose Stride is a power of two, and Start/End are unsigned // integers. Let UMAX be the largest representable unsigned // integer. // // By the preconditions of this function, we know // "(Start + Stride * N) >= End", and this doesn't overflow. // As a formula: // // End <= (Start + Stride * N) <= UMAX // // Subtracting Start from all the terms: // // End - Start <= Stride * N <= UMAX - Start // // Since Start is unsigned, UMAX - Start <= UMAX. Therefore: // // End - Start <= Stride * N <= UMAX // // Stride * N is a multiple of Stride. Therefore, // // End - Start <= Stride * N <= UMAX - (UMAX mod Stride) // // Since Stride is a power of two, UMAX + 1 is divisible by Stride. // Therefore, UMAX mod Stride == Stride - 1. So we can write: // // End - Start <= Stride * N <= UMAX - Stride - 1 // // Dropping the middle term: // // End - Start <= UMAX - Stride - 1 // // Adding Stride - 1 to both sides: // // (End - Start) + (Stride - 1) <= UMAX // // In other words, the addition doesn't have unsigned overflow. // // A similar proof works if we treat Start/End as signed values. // Just rewrite steps before "End - Start <= Stride * N <= UMAX" to // use signed max instead of unsigned max. Note that we're trying // to prove a lack of unsigned overflow in either case. return false; } } if (Start == Stride || Start == getMinusSCEV(Stride, One)) { // If Start is equal to Stride, (End - Start) + (Stride - 1) == End - 1. // If !IsSigned, 0 (BECount)) { ConstantMaxBECount = BECount; } else if (BECountIfBackedgeTaken && isa(BECountIfBackedgeTaken)) { // If we know exactly how many times the backedge will be taken if it's // taken at least once, then the backedge count will either be that or // zero. ConstantMaxBECount = BECountIfBackedgeTaken; MaxOrZero = true; } else { ConstantMaxBECount = computeMaxBECountForLT( Start, Stride, RHS, getTypeSizeInBits(LHS->getType()), IsSigned); } if (isa(ConstantMaxBECount) && !isa(BECount)) ConstantMaxBECount = getConstant(getUnsignedRangeMax(BECount)); const SCEV *SymbolicMaxBECount = isa(BECount) ? 
ConstantMaxBECount : BECount; return ExitLimit(BECount, ConstantMaxBECount, SymbolicMaxBECount, MaxOrZero, Predicates); } ScalarEvolution::ExitLimit ScalarEvolution::howManyGreaterThans( const SCEV *LHS, const SCEV *RHS, const Loop *L, bool IsSigned, bool ControlsOnlyExit, bool AllowPredicates) { SmallPtrSet Predicates; // We handle only IV > Invariant if (!isLoopInvariant(RHS, L)) return getCouldNotCompute(); const SCEVAddRecExpr *IV = dyn_cast(LHS); if (!IV && AllowPredicates) // Try to make this an AddRec using runtime tests, in the first X // iterations of this loop, where X is the SCEV expression found by the // algorithm below. IV = convertSCEVToAddRecWithPredicates(LHS, L, Predicates); // Avoid weird loops if (!IV || IV->getLoop() != L || !IV->isAffine()) return getCouldNotCompute(); auto WrapType = IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW; bool NoWrap = ControlsOnlyExit && IV->getNoWrapFlags(WrapType); ICmpInst::Predicate Cond = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; const SCEV *Stride = getNegativeSCEV(IV->getStepRecurrence(*this)); // Avoid negative or zero stride values if (!isKnownPositive(Stride)) return getCouldNotCompute(); // Avoid proven overflow cases: this will ensure that the backedge taken count // will not generate any unsigned overflow. Relaxed no-overflow conditions // exploit NoWrapFlags, allowing to optimize in presence of undefined // behaviors like the case of C language. if (!Stride->isOne() && !NoWrap) if (canIVOverflowOnGT(RHS, Stride, IsSigned)) return getCouldNotCompute(); const SCEV *Start = IV->getStart(); const SCEV *End = RHS; if (!isLoopEntryGuardedByCond(L, Cond, getAddExpr(Start, Stride), RHS)) { // If we know that Start >= RHS in the context of loop, then we know that // min(RHS, Start) = RHS at this point. if (isLoopEntryGuardedByCond( L, IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE, Start, RHS)) End = RHS; else End = IsSigned ? getSMinExpr(RHS, Start) : getUMinExpr(RHS, Start); } if (Start->getType()->isPointerTy()) { Start = getLosslessPtrToIntExpr(Start); if (isa(Start)) return Start; } if (End->getType()->isPointerTy()) { End = getLosslessPtrToIntExpr(End); if (isa(End)) return End; } // Compute ((Start - End) + (Stride - 1)) / Stride. // FIXME: This can overflow. Holding off on fixing this for now; // howManyGreaterThans will hopefully be gone soon. const SCEV *One = getOne(Stride->getType()); const SCEV *BECount = getUDivExpr( getAddExpr(getMinusSCEV(Start, End), getMinusSCEV(Stride, One)), Stride); APInt MaxStart = IsSigned ? getSignedRangeMax(Start) : getUnsignedRangeMax(Start); APInt MinStride = IsSigned ? getSignedRangeMin(Stride) : getUnsignedRangeMin(Stride); unsigned BitWidth = getTypeSizeInBits(LHS->getType()); APInt Limit = IsSigned ? APInt::getSignedMinValue(BitWidth) + (MinStride - 1) : APInt::getMinValue(BitWidth) + (MinStride - 1); // Although End can be a MIN expression we estimate MinEnd considering only // the case End = RHS. This is safe because in the other case (Start - End) // is zero, leading to a zero maximum backedge taken count. APInt MinEnd = IsSigned ? APIntOps::smax(getSignedRangeMin(RHS), Limit) : APIntOps::umax(getUnsignedRangeMin(RHS), Limit); const SCEV *ConstantMaxBECount = isa(BECount) ? BECount : getUDivCeilSCEV(getConstant(MaxStart - MinEnd), getConstant(MinStride)); if (isa(ConstantMaxBECount)) ConstantMaxBECount = BECount; const SCEV *SymbolicMaxBECount = isa(BECount) ? 
ConstantMaxBECount : BECount; return ExitLimit(BECount, ConstantMaxBECount, SymbolicMaxBECount, false, Predicates); } const SCEV *SCEVAddRecExpr::getNumIterationsInRange(const ConstantRange &Range, ScalarEvolution &SE) const { if (Range.isFullSet()) // Infinite loop. return SE.getCouldNotCompute(); // If the start is a non-zero constant, shift the range to simplify things. if (const SCEVConstant *SC = dyn_cast(getStart())) if (!SC->getValue()->isZero()) { SmallVector Operands(operands()); Operands[0] = SE.getZero(SC->getType()); const SCEV *Shifted = SE.getAddRecExpr(Operands, getLoop(), getNoWrapFlags(FlagNW)); if (const auto *ShiftedAddRec = dyn_cast(Shifted)) return ShiftedAddRec->getNumIterationsInRange( Range.subtract(SC->getAPInt()), SE); // This is strange and shouldn't happen. return SE.getCouldNotCompute(); } // The only time we can solve this is when we have all constant indices. // Otherwise, we cannot determine the overflow conditions. if (any_of(operands(), [](const SCEV *Op) { return !isa(Op); })) return SE.getCouldNotCompute(); // Okay at this point we know that all elements of the chrec are constants and // that the start element is zero. // First check to see if the range contains zero. If not, the first // iteration exits. unsigned BitWidth = SE.getTypeSizeInBits(getType()); if (!Range.contains(APInt(BitWidth, 0))) return SE.getZero(getType()); if (isAffine()) { // If this is an affine expression then we have this situation: // Solve {0,+,A} in Range === Ax in Range // We know that zero is in the range. If A is positive then we know that // the upper value of the range must be the first possible exit value. // If A is negative then the lower of the range is the last possible loop // value. Also note that we already checked for a full range. APInt A = cast(getOperand(1))->getAPInt(); APInt End = A.sge(1) ? (Range.getUpper() - 1) : Range.getLower(); // The exit value should be (End+A)/A. APInt ExitVal = (End + A).udiv(A); ConstantInt *ExitValue = ConstantInt::get(SE.getContext(), ExitVal); // Evaluate at the exit value. If we really did fall out of the valid // range, then we computed our trip count, otherwise wrap around or other // things must have happened. ConstantInt *Val = EvaluateConstantChrecAtConstant(this, ExitValue, SE); if (Range.contains(Val->getValue())) return SE.getCouldNotCompute(); // Something strange happened // Ensure that the previous value is in the range. assert(Range.contains( EvaluateConstantChrecAtConstant(this, ConstantInt::get(SE.getContext(), ExitVal - 1), SE)->getValue()) && "Linear scev computation is off in a bad way!"); return SE.getConstant(ExitValue); } if (isQuadratic()) { if (auto S = SolveQuadraticAddRecRange(this, Range, SE)) return SE.getConstant(*S); } return SE.getCouldNotCompute(); } const SCEVAddRecExpr * SCEVAddRecExpr::getPostIncExpr(ScalarEvolution &SE) const { assert(getNumOperands() > 1 && "AddRec with zero step?"); // There is a temptation to just call getAddExpr(this, getStepRecurrence(SE)), // but in this case we cannot guarantee that the value returned will be an // AddRec because SCEV does not have a fixed point where it stops // simplification: it is legal to return ({rec1} + {rec2}). For example, it // may happen if we reach arithmetic depth limit while simplifying. So we // construct the returned value explicitly. SmallVector Ops; // If this is {A,+,B,+,C,...,+,N}, then its step is {B,+,C,+,...,+,N}, and // (this + Step) is {A+B,+,B+C,+...,+,N}. 
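  // Illustrative example, not from the original source: for the affine
  // recurrence {0,+,4} the post-increment value is {4,+,4}, and for the
  // quadratic {0,+,2,+,1} it is {2,+,3,+,1}, built below by pairwise-adding
  // adjacent operands and reusing the last step unchanged.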
for (unsigned i = 0, e = getNumOperands() - 1; i < e; ++i) Ops.push_back(SE.getAddExpr(getOperand(i), getOperand(i + 1))); // We know that the last operand is not a constant zero (otherwise it would // have been popped out earlier). This guarantees us that if the result has // the same last operand, then it will also not be popped out, meaning that // the returned value will be an AddRec. const SCEV *Last = getOperand(getNumOperands() - 1); assert(!Last->isZero() && "Recurrency with zero step?"); Ops.push_back(Last); return cast(SE.getAddRecExpr(Ops, getLoop(), SCEV::FlagAnyWrap)); } // Return true when S contains at least an undef value. bool ScalarEvolution::containsUndefs(const SCEV *S) const { return SCEVExprContains(S, [](const SCEV *S) { if (const auto *SU = dyn_cast(S)) return isa(SU->getValue()); return false; }); } // Return true when S contains a value that is a nullptr. bool ScalarEvolution::containsErasedValue(const SCEV *S) const { return SCEVExprContains(S, [](const SCEV *S) { if (const auto *SU = dyn_cast(S)) return SU->getValue() == nullptr; return false; }); } /// Return the size of an element read or written by Inst. const SCEV *ScalarEvolution::getElementSize(Instruction *Inst) { Type *Ty; if (StoreInst *Store = dyn_cast(Inst)) Ty = Store->getValueOperand()->getType(); else if (LoadInst *Load = dyn_cast(Inst)) Ty = Load->getType(); else return nullptr; Type *ETy = getEffectiveSCEVType(PointerType::getUnqual(Ty)); return getSizeOfExpr(ETy, Ty); } //===----------------------------------------------------------------------===// // SCEVCallbackVH Class Implementation //===----------------------------------------------------------------------===// void ScalarEvolution::SCEVCallbackVH::deleted() { assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!"); if (PHINode *PN = dyn_cast(getValPtr())) SE->ConstantEvolutionLoopExitValue.erase(PN); SE->eraseValueFromMap(getValPtr()); // this now dangles! } void ScalarEvolution::SCEVCallbackVH::allUsesReplacedWith(Value *V) { assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!"); // Forget all the expressions associated with users of the old value, // so that future queries will recompute the expressions using the new // value. Value *Old = getValPtr(); SmallVector Worklist(Old->users()); SmallPtrSet Visited; while (!Worklist.empty()) { User *U = Worklist.pop_back_val(); // Deleting the Old value will cause this to dangle. Postpone // that until everything else is done. if (U == Old) continue; if (!Visited.insert(U).second) continue; if (PHINode *PN = dyn_cast(U)) SE->ConstantEvolutionLoopExitValue.erase(PN); SE->eraseValueFromMap(U); llvm::append_range(Worklist, U->users()); } // Delete the Old value. if (PHINode *PN = dyn_cast(Old)) SE->ConstantEvolutionLoopExitValue.erase(PN); SE->eraseValueFromMap(Old); // this now dangles! 
} ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(Value *V, ScalarEvolution *se) : CallbackVH(V), SE(se) {} //===----------------------------------------------------------------------===// // ScalarEvolution Class Implementation //===----------------------------------------------------------------------===// ScalarEvolution::ScalarEvolution(Function &F, TargetLibraryInfo &TLI, AssumptionCache &AC, DominatorTree &DT, LoopInfo &LI) : F(F), TLI(TLI), AC(AC), DT(DT), LI(LI), CouldNotCompute(new SCEVCouldNotCompute()), ValuesAtScopes(64), LoopDispositions(64), BlockDispositions(64) { // To use guards for proving predicates, we need to scan every instruction in // relevant basic blocks, and not just terminators. Doing this is a waste of // time if the IR does not actually contain any calls to // @llvm.experimental.guard, so do a quick check and remember this beforehand. // // This pessimizes the case where a pass that preserves ScalarEvolution wants // to _add_ guards to the module when there weren't any before, and wants // ScalarEvolution to optimize based on those guards. For now we prefer to be // efficient in lieu of being smart in that rather obscure case. auto *GuardDecl = F.getParent()->getFunction( Intrinsic::getName(Intrinsic::experimental_guard)); HasGuards = GuardDecl && !GuardDecl->use_empty(); } ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg) : F(Arg.F), HasGuards(Arg.HasGuards), TLI(Arg.TLI), AC(Arg.AC), DT(Arg.DT), LI(Arg.LI), CouldNotCompute(std::move(Arg.CouldNotCompute)), ValueExprMap(std::move(Arg.ValueExprMap)), PendingLoopPredicates(std::move(Arg.PendingLoopPredicates)), PendingPhiRanges(std::move(Arg.PendingPhiRanges)), PendingMerges(std::move(Arg.PendingMerges)), ConstantMultipleCache(std::move(Arg.ConstantMultipleCache)), BackedgeTakenCounts(std::move(Arg.BackedgeTakenCounts)), PredicatedBackedgeTakenCounts( std::move(Arg.PredicatedBackedgeTakenCounts)), BECountUsers(std::move(Arg.BECountUsers)), ConstantEvolutionLoopExitValue( std::move(Arg.ConstantEvolutionLoopExitValue)), ValuesAtScopes(std::move(Arg.ValuesAtScopes)), ValuesAtScopesUsers(std::move(Arg.ValuesAtScopesUsers)), LoopDispositions(std::move(Arg.LoopDispositions)), LoopPropertiesCache(std::move(Arg.LoopPropertiesCache)), BlockDispositions(std::move(Arg.BlockDispositions)), SCEVUsers(std::move(Arg.SCEVUsers)), UnsignedRanges(std::move(Arg.UnsignedRanges)), SignedRanges(std::move(Arg.SignedRanges)), UniqueSCEVs(std::move(Arg.UniqueSCEVs)), UniquePreds(std::move(Arg.UniquePreds)), SCEVAllocator(std::move(Arg.SCEVAllocator)), LoopUsers(std::move(Arg.LoopUsers)), PredicatedSCEVRewrites(std::move(Arg.PredicatedSCEVRewrites)), FirstUnknown(Arg.FirstUnknown) { Arg.FirstUnknown = nullptr; } ScalarEvolution::~ScalarEvolution() { // Iterate through all the SCEVUnknown instances and call their // destructors, so that they release their references to their values. 
for (SCEVUnknown *U = FirstUnknown; U;) { SCEVUnknown *Tmp = U; U = U->Next; Tmp->~SCEVUnknown(); } FirstUnknown = nullptr; ExprValueMap.clear(); ValueExprMap.clear(); HasRecMap.clear(); BackedgeTakenCounts.clear(); PredicatedBackedgeTakenCounts.clear(); assert(PendingLoopPredicates.empty() && "isImpliedCond garbage"); assert(PendingPhiRanges.empty() && "getRangeRef garbage"); assert(PendingMerges.empty() && "isImpliedViaMerge garbage"); assert(!WalkingBEDominatingConds && "isLoopBackedgeGuardedByCond garbage!"); assert(!ProvingSplitPredicate && "ProvingSplitPredicate garbage!"); } bool ScalarEvolution::hasLoopInvariantBackedgeTakenCount(const Loop *L) { return !isa(getBackedgeTakenCount(L)); } static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE, const Loop *L) { // Print all inner loops first for (Loop *I : *L) PrintLoopInfo(OS, SE, I); OS << "Loop "; L->getHeader()->printAsOperand(OS, /*PrintType=*/false); OS << ": "; SmallVector ExitingBlocks; L->getExitingBlocks(ExitingBlocks); if (ExitingBlocks.size() != 1) OS << " "; if (SE->hasLoopInvariantBackedgeTakenCount(L)) OS << "backedge-taken count is " << *SE->getBackedgeTakenCount(L) << "\n"; else OS << "Unpredictable backedge-taken count.\n"; if (ExitingBlocks.size() > 1) for (BasicBlock *ExitingBlock : ExitingBlocks) { OS << " exit count for " << ExitingBlock->getName() << ": " << *SE->getExitCount(L, ExitingBlock) << "\n"; } OS << "Loop "; L->getHeader()->printAsOperand(OS, /*PrintType=*/false); OS << ": "; auto *ConstantBTC = SE->getConstantMaxBackedgeTakenCount(L); if (!isa(ConstantBTC)) { OS << "constant max backedge-taken count is " << *ConstantBTC; if (SE->isBackedgeTakenCountMaxOrZero(L)) OS << ", actual taken count either this or zero."; } else { OS << "Unpredictable constant max backedge-taken count. "; } OS << "\n" "Loop "; L->getHeader()->printAsOperand(OS, /*PrintType=*/false); OS << ": "; auto *SymbolicBTC = SE->getSymbolicMaxBackedgeTakenCount(L); if (!isa(SymbolicBTC)) { OS << "symbolic max backedge-taken count is " << *SymbolicBTC; if (SE->isBackedgeTakenCountMaxOrZero(L)) OS << ", actual taken count either this or zero."; } else { OS << "Unpredictable symbolic max backedge-taken count. "; } OS << "\n"; if (ExitingBlocks.size() > 1) for (BasicBlock *ExitingBlock : ExitingBlocks) { OS << " symbolic max exit count for " << ExitingBlock->getName() << ": " << *SE->getExitCount(L, ExitingBlock, ScalarEvolution::SymbolicMaximum) << "\n"; } OS << "Loop "; L->getHeader()->printAsOperand(OS, /*PrintType=*/false); OS << ": "; SmallVector Preds; auto PBT = SE->getPredicatedBackedgeTakenCount(L, Preds); if (!isa(PBT)) { OS << "Predicated backedge-taken count is " << *PBT << "\n"; OS << " Predicates:\n"; for (const auto *P : Preds) P->print(OS, 4); } else { OS << "Unpredictable predicated backedge-taken count. 
"; } OS << "\n"; if (SE->hasLoopInvariantBackedgeTakenCount(L)) { OS << "Loop "; L->getHeader()->printAsOperand(OS, /*PrintType=*/false); OS << ": "; OS << "Trip multiple is " << SE->getSmallConstantTripMultiple(L) << "\n"; } } namespace llvm { raw_ostream &operator<<(raw_ostream &OS, ScalarEvolution::LoopDisposition LD) { switch (LD) { case ScalarEvolution::LoopVariant: OS << "Variant"; break; case ScalarEvolution::LoopInvariant: OS << "Invariant"; break; case ScalarEvolution::LoopComputable: OS << "Computable"; break; } return OS; } raw_ostream &operator<<(raw_ostream &OS, ScalarEvolution::BlockDisposition BD) { switch (BD) { case ScalarEvolution::DoesNotDominateBlock: OS << "DoesNotDominate"; break; case ScalarEvolution::DominatesBlock: OS << "Dominates"; break; case ScalarEvolution::ProperlyDominatesBlock: OS << "ProperlyDominates"; break; } return OS; } } void ScalarEvolution::print(raw_ostream &OS) const { // ScalarEvolution's implementation of the print method is to print // out SCEV values of all instructions that are interesting. Doing // this potentially causes it to create new SCEV objects though, // which technically conflicts with the const qualifier. This isn't // observable from outside the class though, so casting away the // const isn't dangerous. ScalarEvolution &SE = *const_cast(this); if (ClassifyExpressions) { OS << "Classifying expressions for: "; F.printAsOperand(OS, /*PrintType=*/false); OS << "\n"; for (Instruction &I : instructions(F)) if (isSCEVable(I.getType()) && !isa(I)) { OS << I << '\n'; OS << " --> "; const SCEV *SV = SE.getSCEV(&I); SV->print(OS); if (!isa(SV)) { OS << " U: "; SE.getUnsignedRange(SV).print(OS); OS << " S: "; SE.getSignedRange(SV).print(OS); } const Loop *L = LI.getLoopFor(I.getParent()); const SCEV *AtUse = SE.getSCEVAtScope(SV, L); if (AtUse != SV) { OS << " --> "; AtUse->print(OS); if (!isa(AtUse)) { OS << " U: "; SE.getUnsignedRange(AtUse).print(OS); OS << " S: "; SE.getSignedRange(AtUse).print(OS); } } if (L) { OS << "\t\t" "Exits: "; const SCEV *ExitValue = SE.getSCEVAtScope(SV, L->getParentLoop()); if (!SE.isLoopInvariant(ExitValue, L)) { OS << "<>"; } else { OS << *ExitValue; } bool First = true; for (const auto *Iter = L; Iter; Iter = Iter->getParentLoop()) { if (First) { OS << "\t\t" "LoopDispositions: { "; First = false; } else { OS << ", "; } Iter->getHeader()->printAsOperand(OS, /*PrintType=*/false); OS << ": " << SE.getLoopDisposition(SV, Iter); } for (const auto *InnerL : depth_first(L)) { if (InnerL == L) continue; if (First) { OS << "\t\t" "LoopDispositions: { "; First = false; } else { OS << ", "; } InnerL->getHeader()->printAsOperand(OS, /*PrintType=*/false); OS << ": " << SE.getLoopDisposition(SV, InnerL); } OS << " }"; } OS << "\n"; } } OS << "Determining loop execution counts for: "; F.printAsOperand(OS, /*PrintType=*/false); OS << "\n"; for (Loop *I : LI) PrintLoopInfo(OS, &SE, I); } ScalarEvolution::LoopDisposition ScalarEvolution::getLoopDisposition(const SCEV *S, const Loop *L) { auto &Values = LoopDispositions[S]; for (auto &V : Values) { if (V.getPointer() == L) return V.getInt(); } Values.emplace_back(L, LoopVariant); LoopDisposition D = computeLoopDisposition(S, L); auto &Values2 = LoopDispositions[S]; for (auto &V : llvm::reverse(Values2)) { if (V.getPointer() == L) { V.setInt(D); break; } } return D; } ScalarEvolution::LoopDisposition ScalarEvolution::computeLoopDisposition(const SCEV *S, const Loop *L) { switch (S->getSCEVType()) { case scConstant: case scVScale: return LoopInvariant; case scAddRecExpr: { 
const SCEVAddRecExpr *AR = cast(S); // If L is the addrec's loop, it's computable. if (AR->getLoop() == L) return LoopComputable; // Add recurrences are never invariant in the function-body (null loop). if (!L) return LoopVariant; // Everything that is not defined at loop entry is variant. if (DT.dominates(L->getHeader(), AR->getLoop()->getHeader())) return LoopVariant; assert(!L->contains(AR->getLoop()) && "Containing loop's header does not" " dominate the contained loop's header?"); // This recurrence is invariant w.r.t. L if AR's loop contains L. if (AR->getLoop()->contains(L)) return LoopInvariant; // This recurrence is variant w.r.t. L if any of its operands // are variant. for (const auto *Op : AR->operands()) if (!isLoopInvariant(Op, L)) return LoopVariant; // Otherwise it's loop-invariant. return LoopInvariant; } case scTruncate: case scZeroExtend: case scSignExtend: case scPtrToInt: case scAddExpr: case scMulExpr: case scUDivExpr: case scUMaxExpr: case scSMaxExpr: case scUMinExpr: case scSMinExpr: case scSequentialUMinExpr: { bool HasVarying = false; for (const auto *Op : S->operands()) { LoopDisposition D = getLoopDisposition(Op, L); if (D == LoopVariant) return LoopVariant; if (D == LoopComputable) HasVarying = true; } return HasVarying ? LoopComputable : LoopInvariant; } case scUnknown: // All non-instruction values are loop invariant. All instructions are loop // invariant if they are not contained in the specified loop. // Instructions are never considered invariant in the function body // (null loop) because they are defined within the "loop". if (auto *I = dyn_cast(cast(S)->getValue())) return (L && !L->contains(I)) ? LoopInvariant : LoopVariant; return LoopInvariant; case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); } llvm_unreachable("Unknown SCEV kind!"); } bool ScalarEvolution::isLoopInvariant(const SCEV *S, const Loop *L) { return getLoopDisposition(S, L) == LoopInvariant; } bool ScalarEvolution::hasComputableLoopEvolution(const SCEV *S, const Loop *L) { return getLoopDisposition(S, L) == LoopComputable; } ScalarEvolution::BlockDisposition ScalarEvolution::getBlockDisposition(const SCEV *S, const BasicBlock *BB) { auto &Values = BlockDispositions[S]; for (auto &V : Values) { if (V.getPointer() == BB) return V.getInt(); } Values.emplace_back(BB, DoesNotDominateBlock); BlockDisposition D = computeBlockDisposition(S, BB); auto &Values2 = BlockDispositions[S]; for (auto &V : llvm::reverse(Values2)) { if (V.getPointer() == BB) { V.setInt(D); break; } } return D; } ScalarEvolution::BlockDisposition ScalarEvolution::computeBlockDisposition(const SCEV *S, const BasicBlock *BB) { switch (S->getSCEVType()) { case scConstant: case scVScale: return ProperlyDominatesBlock; case scAddRecExpr: { // This uses a "dominates" query instead of "properly dominates" query // to test for proper dominance too, because the instruction which // produces the addrec's value is a PHI, and a PHI effectively properly // dominates its entire containing block. const SCEVAddRecExpr *AR = cast(S); if (!DT.dominates(AR->getLoop()->getHeader(), BB)) return DoesNotDominateBlock; // Fall through into SCEVNAryExpr handling. 
[[fallthrough]]; } case scTruncate: case scZeroExtend: case scSignExtend: case scPtrToInt: case scAddExpr: case scMulExpr: case scUDivExpr: case scUMaxExpr: case scSMaxExpr: case scUMinExpr: case scSMinExpr: case scSequentialUMinExpr: { bool Proper = true; for (const SCEV *NAryOp : S->operands()) { BlockDisposition D = getBlockDisposition(NAryOp, BB); if (D == DoesNotDominateBlock) return DoesNotDominateBlock; if (D == DominatesBlock) Proper = false; } return Proper ? ProperlyDominatesBlock : DominatesBlock; } case scUnknown: if (Instruction *I = dyn_cast(cast(S)->getValue())) { if (I->getParent() == BB) return DominatesBlock; if (DT.properlyDominates(I->getParent(), BB)) return ProperlyDominatesBlock; return DoesNotDominateBlock; } return ProperlyDominatesBlock; case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); } llvm_unreachable("Unknown SCEV kind!"); } bool ScalarEvolution::dominates(const SCEV *S, const BasicBlock *BB) { return getBlockDisposition(S, BB) >= DominatesBlock; } bool ScalarEvolution::properlyDominates(const SCEV *S, const BasicBlock *BB) { return getBlockDisposition(S, BB) == ProperlyDominatesBlock; } bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const { return SCEVExprContains(S, [&](const SCEV *Expr) { return Expr == Op; }); } void ScalarEvolution::forgetBackedgeTakenCounts(const Loop *L, bool Predicated) { auto &BECounts = Predicated ? PredicatedBackedgeTakenCounts : BackedgeTakenCounts; auto It = BECounts.find(L); if (It != BECounts.end()) { for (const ExitNotTakenInfo &ENT : It->second.ExitNotTaken) { for (const SCEV *S : {ENT.ExactNotTaken, ENT.SymbolicMaxNotTaken}) { if (!isa(S)) { auto UserIt = BECountUsers.find(S); assert(UserIt != BECountUsers.end()); UserIt->second.erase({L, Predicated}); } } } BECounts.erase(It); } } void ScalarEvolution::forgetMemoizedResults(ArrayRef SCEVs) { SmallPtrSet ToForget(SCEVs.begin(), SCEVs.end()); SmallVector Worklist(ToForget.begin(), ToForget.end()); while (!Worklist.empty()) { const SCEV *Curr = Worklist.pop_back_val(); auto Users = SCEVUsers.find(Curr); if (Users != SCEVUsers.end()) for (const auto *User : Users->second) if (ToForget.insert(User).second) Worklist.push_back(User); } for (const auto *S : ToForget) forgetMemoizedResultsImpl(S); for (auto I = PredicatedSCEVRewrites.begin(); I != PredicatedSCEVRewrites.end();) { std::pair Entry = I->first; if (ToForget.count(Entry.first)) PredicatedSCEVRewrites.erase(I++); else ++I; } } void ScalarEvolution::forgetMemoizedResultsImpl(const SCEV *S) { LoopDispositions.erase(S); BlockDispositions.erase(S); UnsignedRanges.erase(S); SignedRanges.erase(S); HasRecMap.erase(S); ConstantMultipleCache.erase(S); if (auto *AR = dyn_cast(S)) { UnsignedWrapViaInductionTried.erase(AR); SignedWrapViaInductionTried.erase(AR); } auto ExprIt = ExprValueMap.find(S); if (ExprIt != ExprValueMap.end()) { for (Value *V : ExprIt->second) { auto ValueIt = ValueExprMap.find_as(V); if (ValueIt != ValueExprMap.end()) ValueExprMap.erase(ValueIt); } ExprValueMap.erase(ExprIt); } auto ScopeIt = ValuesAtScopes.find(S); if (ScopeIt != ValuesAtScopes.end()) { for (const auto &Pair : ScopeIt->second) if (!isa_and_nonnull(Pair.second)) erase_value(ValuesAtScopesUsers[Pair.second], std::make_pair(Pair.first, S)); ValuesAtScopes.erase(ScopeIt); } auto ScopeUserIt = ValuesAtScopesUsers.find(S); if (ScopeUserIt != ValuesAtScopesUsers.end()) { for (const auto &Pair : ScopeUserIt->second) erase_value(ValuesAtScopes[Pair.second], std::make_pair(Pair.first, S)); 
ValuesAtScopesUsers.erase(ScopeUserIt); } auto BEUsersIt = BECountUsers.find(S); if (BEUsersIt != BECountUsers.end()) { // Work on a copy, as forgetBackedgeTakenCounts() will modify the original. auto Copy = BEUsersIt->second; for (const auto &Pair : Copy) forgetBackedgeTakenCounts(Pair.getPointer(), Pair.getInt()); BECountUsers.erase(BEUsersIt); } auto FoldUser = FoldCacheUser.find(S); if (FoldUser != FoldCacheUser.end()) for (auto &KV : FoldUser->second) FoldCache.erase(KV); FoldCacheUser.erase(S); } void ScalarEvolution::getUsedLoops(const SCEV *S, SmallPtrSetImpl &LoopsUsed) { struct FindUsedLoops { FindUsedLoops(SmallPtrSetImpl &LoopsUsed) : LoopsUsed(LoopsUsed) {} SmallPtrSetImpl &LoopsUsed; bool follow(const SCEV *S) { if (auto *AR = dyn_cast(S)) LoopsUsed.insert(AR->getLoop()); return true; } bool isDone() const { return false; } }; FindUsedLoops F(LoopsUsed); SCEVTraversal(F).visitAll(S); } void ScalarEvolution::getReachableBlocks( SmallPtrSetImpl &Reachable, Function &F) { SmallVector Worklist; Worklist.push_back(&F.getEntryBlock()); while (!Worklist.empty()) { BasicBlock *BB = Worklist.pop_back_val(); if (!Reachable.insert(BB).second) continue; Value *Cond; BasicBlock *TrueBB, *FalseBB; if (match(BB->getTerminator(), m_Br(m_Value(Cond), m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) { if (auto *C = dyn_cast(Cond)) { Worklist.push_back(C->isOne() ? TrueBB : FalseBB); continue; } if (auto *Cmp = dyn_cast(Cond)) { const SCEV *L = getSCEV(Cmp->getOperand(0)); const SCEV *R = getSCEV(Cmp->getOperand(1)); if (isKnownPredicateViaConstantRanges(Cmp->getPredicate(), L, R)) { Worklist.push_back(TrueBB); continue; } if (isKnownPredicateViaConstantRanges(Cmp->getInversePredicate(), L, R)) { Worklist.push_back(FalseBB); continue; } } } append_range(Worklist, successors(BB)); } } void ScalarEvolution::verify() const { ScalarEvolution &SE = *const_cast(this); ScalarEvolution SE2(F, TLI, AC, DT, LI); SmallVector LoopStack(LI.begin(), LI.end()); // Map's SCEV expressions from one ScalarEvolution "universe" to another. struct SCEVMapper : public SCEVRewriteVisitor { SCEVMapper(ScalarEvolution &SE) : SCEVRewriteVisitor(SE) {} const SCEV *visitConstant(const SCEVConstant *Constant) { return SE.getConstant(Constant->getAPInt()); } const SCEV *visitUnknown(const SCEVUnknown *Expr) { return SE.getUnknown(Expr->getValue()); } const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *Expr) { return SE.getCouldNotCompute(); } }; SCEVMapper SCM(SE2); SmallPtrSet ReachableBlocks; SE2.getReachableBlocks(ReachableBlocks, F); auto GetDelta = [&](const SCEV *Old, const SCEV *New) -> const SCEV * { if (containsUndefs(Old) || containsUndefs(New)) { // SCEV treats "undef" as an unknown but consistent value (i.e. it does // not propagate undef aggressively). This means we can (and do) fail // verification in cases where a transform makes a value go from "undef" // to "undef+1" (say). The transform is fine, since in both cases the // result is "undef", but SCEV thinks the value increased by 1. return nullptr; } // Unless VerifySCEVStrict is set, we only compare constant deltas. const SCEV *Delta = SE2.getMinusSCEV(Old, New); if (!VerifySCEVStrict && !isa(Delta)) return nullptr; return Delta; }; while (!LoopStack.empty()) { auto *L = LoopStack.pop_back_val(); llvm::append_range(LoopStack, *L); // Only verify BECounts in reachable loops. For an unreachable loop, // any BECount is legal. if (!ReachableBlocks.contains(L->getHeader())) continue; // Only verify cached BECounts. 
Computing new BECounts may change the // results of subsequent SCEV uses. auto It = BackedgeTakenCounts.find(L); if (It == BackedgeTakenCounts.end()) continue; auto *CurBECount = SCM.visit(It->second.getExact(L, const_cast(this))); auto *NewBECount = SE2.getBackedgeTakenCount(L); if (CurBECount == SE2.getCouldNotCompute() || NewBECount == SE2.getCouldNotCompute()) { // NB! This situation is legal, but is very suspicious -- whatever pass // change the loop to make a trip count go from could not compute to // computable or vice-versa *should have* invalidated SCEV. However, we // choose not to assert here (for now) since we don't want false // positives. continue; } if (SE.getTypeSizeInBits(CurBECount->getType()) > SE.getTypeSizeInBits(NewBECount->getType())) NewBECount = SE2.getZeroExtendExpr(NewBECount, CurBECount->getType()); else if (SE.getTypeSizeInBits(CurBECount->getType()) < SE.getTypeSizeInBits(NewBECount->getType())) CurBECount = SE2.getZeroExtendExpr(CurBECount, NewBECount->getType()); const SCEV *Delta = GetDelta(CurBECount, NewBECount); if (Delta && !Delta->isZero()) { dbgs() << "Trip Count for " << *L << " Changed!\n"; dbgs() << "Old: " << *CurBECount << "\n"; dbgs() << "New: " << *NewBECount << "\n"; dbgs() << "Delta: " << *Delta << "\n"; std::abort(); } } // Collect all valid loops currently in LoopInfo. SmallPtrSet ValidLoops; SmallVector Worklist(LI.begin(), LI.end()); while (!Worklist.empty()) { Loop *L = Worklist.pop_back_val(); if (ValidLoops.insert(L).second) Worklist.append(L->begin(), L->end()); } for (const auto &KV : ValueExprMap) { #ifndef NDEBUG // Check for SCEV expressions referencing invalid/deleted loops. if (auto *AR = dyn_cast(KV.second)) { assert(ValidLoops.contains(AR->getLoop()) && "AddRec references invalid loop"); } #endif // Check that the value is also part of the reverse map. auto It = ExprValueMap.find(KV.second); if (It == ExprValueMap.end() || !It->second.contains(KV.first)) { dbgs() << "Value " << *KV.first << " is in ValueExprMap but not in ExprValueMap\n"; std::abort(); } if (auto *I = dyn_cast(&*KV.first)) { if (!ReachableBlocks.contains(I->getParent())) continue; const SCEV *OldSCEV = SCM.visit(KV.second); const SCEV *NewSCEV = SE2.getSCEV(I); const SCEV *Delta = GetDelta(OldSCEV, NewSCEV); if (Delta && !Delta->isZero()) { dbgs() << "SCEV for value " << *I << " changed!\n" << "Old: " << *OldSCEV << "\n" << "New: " << *NewSCEV << "\n" << "Delta: " << *Delta << "\n"; std::abort(); } } } for (const auto &KV : ExprValueMap) { for (Value *V : KV.second) { auto It = ValueExprMap.find_as(V); if (It == ValueExprMap.end()) { dbgs() << "Value " << *V << " is in ExprValueMap but not in ValueExprMap\n"; std::abort(); } if (It->second != KV.first) { dbgs() << "Value " << *V << " mapped to " << *It->second << " rather than " << *KV.first << "\n"; std::abort(); } } } // Verify integrity of SCEV users. for (const auto &S : UniqueSCEVs) { for (const auto *Op : S.operands()) { // We do not store dependencies of constants. if (isa(Op)) continue; auto It = SCEVUsers.find(Op); if (It != SCEVUsers.end() && It->second.count(&S)) continue; dbgs() << "Use of operand " << *Op << " by user " << S << " is not being tracked!\n"; std::abort(); } } // Verify integrity of ValuesAtScopes users. 
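// ValuesAtScopes and ValuesAtScopesUsers form a forward/reverse pair:
// whenever ValuesAtScopes[S] contains (L, SAtScope) and SAtScope is not a
// constant, ValuesAtScopesUsers[SAtScope] must contain (L, S). The two
// loops below check both directions of that invariant.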
for (const auto &ValueAndVec : ValuesAtScopes) { const SCEV *Value = ValueAndVec.first; for (const auto &LoopAndValueAtScope : ValueAndVec.second) { const Loop *L = LoopAndValueAtScope.first; const SCEV *ValueAtScope = LoopAndValueAtScope.second; if (!isa(ValueAtScope)) { auto It = ValuesAtScopesUsers.find(ValueAtScope); if (It != ValuesAtScopesUsers.end() && is_contained(It->second, std::make_pair(L, Value))) continue; dbgs() << "Value: " << *Value << ", Loop: " << *L << ", ValueAtScope: " << *ValueAtScope << " missing in ValuesAtScopesUsers\n"; std::abort(); } } } for (const auto &ValueAtScopeAndVec : ValuesAtScopesUsers) { const SCEV *ValueAtScope = ValueAtScopeAndVec.first; for (const auto &LoopAndValue : ValueAtScopeAndVec.second) { const Loop *L = LoopAndValue.first; const SCEV *Value = LoopAndValue.second; assert(!isa(Value)); auto It = ValuesAtScopes.find(Value); if (It != ValuesAtScopes.end() && is_contained(It->second, std::make_pair(L, ValueAtScope))) continue; dbgs() << "Value: " << *Value << ", Loop: " << *L << ", ValueAtScope: " << *ValueAtScope << " missing in ValuesAtScopes\n"; std::abort(); } } // Verify integrity of BECountUsers. auto VerifyBECountUsers = [&](bool Predicated) { auto &BECounts = Predicated ? PredicatedBackedgeTakenCounts : BackedgeTakenCounts; for (const auto &LoopAndBEInfo : BECounts) { for (const ExitNotTakenInfo &ENT : LoopAndBEInfo.second.ExitNotTaken) { for (const SCEV *S : {ENT.ExactNotTaken, ENT.SymbolicMaxNotTaken}) { if (!isa(S)) { auto UserIt = BECountUsers.find(S); if (UserIt != BECountUsers.end() && UserIt->second.contains({ LoopAndBEInfo.first, Predicated })) continue; dbgs() << "Value " << *S << " for loop " << *LoopAndBEInfo.first << " missing from BECountUsers\n"; std::abort(); } } } } }; VerifyBECountUsers(/* Predicated */ false); VerifyBECountUsers(/* Predicated */ true); // Verify intergity of loop disposition cache. for (auto &[S, Values] : LoopDispositions) { for (auto [Loop, CachedDisposition] : Values) { const auto RecomputedDisposition = SE2.getLoopDisposition(S, Loop); if (CachedDisposition != RecomputedDisposition) { dbgs() << "Cached disposition of " << *S << " for loop " << *Loop << " is incorrect: cached " << CachedDisposition << ", actual " << RecomputedDisposition << "\n"; std::abort(); } } } // Verify integrity of the block disposition cache. for (auto &[S, Values] : BlockDispositions) { for (auto [BB, CachedDisposition] : Values) { const auto RecomputedDisposition = SE2.getBlockDisposition(S, BB); if (CachedDisposition != RecomputedDisposition) { dbgs() << "Cached disposition of " << *S << " for block %" << BB->getName() << " is incorrect: cached " << CachedDisposition << ", actual " << RecomputedDisposition << "\n"; std::abort(); } } } // Verify FoldCache/FoldCacheUser caches. 
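// FoldCache maps a folding key (FoldID) to a previously computed SCEV, and
// FoldCacheUser is the reverse index from that SCEV back to every FoldID
// that produced it. The checks below verify both directions: each cached
// fold must be registered under its result in FoldCacheUser, and each
// FoldID listed there must still resolve to the same expression.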
for (auto [FoldID, Expr] : FoldCache) { auto I = FoldCacheUser.find(Expr); if (I == FoldCacheUser.end()) { dbgs() << "Missing entry in FoldCacheUser for cached expression " << *Expr << "!\n"; std::abort(); } if (!is_contained(I->second, FoldID)) { dbgs() << "Missing FoldID in cached users of " << *Expr << "!\n"; std::abort(); } } for (auto [Expr, IDs] : FoldCacheUser) { for (auto &FoldID : IDs) { auto I = FoldCache.find(FoldID); if (I == FoldCache.end()) { dbgs() << "Missing entry in FoldCache for expression " << *Expr << "!\n"; std::abort(); } if (I->second != Expr) { dbgs() << "Entry in FoldCache doesn't match FoldCacheUser: " << *I->second << " != " << *Expr << "!\n"; std::abort(); } } } // Verify that ConstantMultipleCache computations are correct. We check that // cached multiples and recomputed multiples are multiples of each other to // verify correctness. It is possible that a recomputed multiple is different // from the cached multiple due to strengthened no wrap flags or changes in // KnownBits computations. for (auto [S, Multiple] : ConstantMultipleCache) { APInt RecomputedMultiple = SE2.getConstantMultiple(S); if ((Multiple != 0 && RecomputedMultiple != 0 && Multiple.urem(RecomputedMultiple) != 0 && RecomputedMultiple.urem(Multiple) != 0)) { dbgs() << "Incorrect cached computation in ConstantMultipleCache for " << *S << " : Computed " << RecomputedMultiple << " but cache contains " << Multiple << "!\n"; std::abort(); } } } bool ScalarEvolution::invalidate( Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &Inv) { // Invalidate the ScalarEvolution object whenever it isn't preserved or one // of its dependencies is invalidated. auto PAC = PA.getChecker(); return !(PAC.preserved() || PAC.preservedSet>()) || Inv.invalidate(F, PA) || Inv.invalidate(F, PA) || Inv.invalidate(F, PA); } AnalysisKey ScalarEvolutionAnalysis::Key; ScalarEvolution ScalarEvolutionAnalysis::run(Function &F, FunctionAnalysisManager &AM) { auto &TLI = AM.getResult(F); auto &AC = AM.getResult(F); auto &DT = AM.getResult(F); auto &LI = AM.getResult(F); return ScalarEvolution(F, TLI, AC, DT, LI); } PreservedAnalyses ScalarEvolutionVerifierPass::run(Function &F, FunctionAnalysisManager &AM) { AM.getResult(F).verify(); return PreservedAnalyses::all(); } PreservedAnalyses ScalarEvolutionPrinterPass::run(Function &F, FunctionAnalysisManager &AM) { // For compatibility with opt's -analyze feature under legacy pass manager // which was not ported to NPM. This keeps tests using // update_analyze_test_checks.py working. 
OS << "Printing analysis 'Scalar Evolution Analysis' for function '" << F.getName() << "':\n"; AM.getResult(F).print(OS); return PreservedAnalyses::all(); } INITIALIZE_PASS_BEGIN(ScalarEvolutionWrapperPass, "scalar-evolution", "Scalar Evolution Analysis", false, true) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(ScalarEvolutionWrapperPass, "scalar-evolution", "Scalar Evolution Analysis", false, true) char ScalarEvolutionWrapperPass::ID = 0; ScalarEvolutionWrapperPass::ScalarEvolutionWrapperPass() : FunctionPass(ID) { initializeScalarEvolutionWrapperPassPass(*PassRegistry::getPassRegistry()); } bool ScalarEvolutionWrapperPass::runOnFunction(Function &F) { SE.reset(new ScalarEvolution( F, getAnalysis().getTLI(F), getAnalysis().getAssumptionCache(F), getAnalysis().getDomTree(), getAnalysis().getLoopInfo())); return false; } void ScalarEvolutionWrapperPass::releaseMemory() { SE.reset(); } void ScalarEvolutionWrapperPass::print(raw_ostream &OS, const Module *) const { SE->print(OS); } void ScalarEvolutionWrapperPass::verifyAnalysis() const { if (!VerifySCEV) return; SE->verify(); } void ScalarEvolutionWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequiredTransitive(); AU.addRequiredTransitive(); AU.addRequiredTransitive(); AU.addRequiredTransitive(); } const SCEVPredicate *ScalarEvolution::getEqualPredicate(const SCEV *LHS, const SCEV *RHS) { return getComparePredicate(ICmpInst::ICMP_EQ, LHS, RHS); } const SCEVPredicate * ScalarEvolution::getComparePredicate(const ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { FoldingSetNodeID ID; assert(LHS->getType() == RHS->getType() && "Type mismatch between LHS and RHS"); // Unique this node based on the arguments ID.AddInteger(SCEVPredicate::P_Compare); ID.AddInteger(Pred); ID.AddPointer(LHS); ID.AddPointer(RHS); void *IP = nullptr; if (const auto *S = UniquePreds.FindNodeOrInsertPos(ID, IP)) return S; SCEVComparePredicate *Eq = new (SCEVAllocator) SCEVComparePredicate(ID.Intern(SCEVAllocator), Pred, LHS, RHS); UniquePreds.InsertNode(Eq, IP); return Eq; } const SCEVPredicate *ScalarEvolution::getWrapPredicate( const SCEVAddRecExpr *AR, SCEVWrapPredicate::IncrementWrapFlags AddedFlags) { FoldingSetNodeID ID; // Unique this node based on the arguments ID.AddInteger(SCEVPredicate::P_Wrap); ID.AddPointer(AR); ID.AddInteger(AddedFlags); void *IP = nullptr; if (const auto *S = UniquePreds.FindNodeOrInsertPos(ID, IP)) return S; auto *OF = new (SCEVAllocator) SCEVWrapPredicate(ID.Intern(SCEVAllocator), AR, AddedFlags); UniquePreds.InsertNode(OF, IP); return OF; } namespace { class SCEVPredicateRewriter : public SCEVRewriteVisitor { public: /// Rewrites \p S in the context of a loop L and the SCEV predication /// infrastructure. /// /// If \p Pred is non-null, the SCEV expression is rewritten to respect the /// equivalences present in \p Pred. /// /// If \p NewPreds is non-null, rewrite is free to add further predicates to /// \p NewPreds such that the result will be an AddRecExpr. 
static const SCEV *rewrite(const SCEV *S, const Loop *L, ScalarEvolution &SE, SmallPtrSetImpl *NewPreds, const SCEVPredicate *Pred) { SCEVPredicateRewriter Rewriter(L, SE, NewPreds, Pred); return Rewriter.visit(S); } const SCEV *visitUnknown(const SCEVUnknown *Expr) { if (Pred) { if (auto *U = dyn_cast(Pred)) { for (const auto *Pred : U->getPredicates()) if (const auto *IPred = dyn_cast(Pred)) if (IPred->getLHS() == Expr && IPred->getPredicate() == ICmpInst::ICMP_EQ) return IPred->getRHS(); } else if (const auto *IPred = dyn_cast(Pred)) { if (IPred->getLHS() == Expr && IPred->getPredicate() == ICmpInst::ICMP_EQ) return IPred->getRHS(); } } return convertToAddRecWithPreds(Expr); } const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) { const SCEV *Operand = visit(Expr->getOperand()); const SCEVAddRecExpr *AR = dyn_cast(Operand); if (AR && AR->getLoop() == L && AR->isAffine()) { // This couldn't be folded because the operand didn't have the nuw // flag. Add the nusw flag as an assumption that we could make. const SCEV *Step = AR->getStepRecurrence(SE); Type *Ty = Expr->getType(); if (addOverflowAssumption(AR, SCEVWrapPredicate::IncrementNUSW)) return SE.getAddRecExpr(SE.getZeroExtendExpr(AR->getStart(), Ty), SE.getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); } return SE.getZeroExtendExpr(Operand, Expr->getType()); } const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) { const SCEV *Operand = visit(Expr->getOperand()); const SCEVAddRecExpr *AR = dyn_cast(Operand); if (AR && AR->getLoop() == L && AR->isAffine()) { // This couldn't be folded because the operand didn't have the nsw // flag. Add the nssw flag as an assumption that we could make. const SCEV *Step = AR->getStepRecurrence(SE); Type *Ty = Expr->getType(); if (addOverflowAssumption(AR, SCEVWrapPredicate::IncrementNSSW)) return SE.getAddRecExpr(SE.getSignExtendExpr(AR->getStart(), Ty), SE.getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); } return SE.getSignExtendExpr(Operand, Expr->getType()); } private: explicit SCEVPredicateRewriter(const Loop *L, ScalarEvolution &SE, SmallPtrSetImpl *NewPreds, const SCEVPredicate *Pred) : SCEVRewriteVisitor(SE), NewPreds(NewPreds), Pred(Pred), L(L) {} bool addOverflowAssumption(const SCEVPredicate *P) { if (!NewPreds) { // Check if we've already made this assumption. return Pred && Pred->implies(P); } NewPreds->insert(P); return true; } bool addOverflowAssumption(const SCEVAddRecExpr *AR, SCEVWrapPredicate::IncrementWrapFlags AddedFlags) { auto *A = SE.getWrapPredicate(AR, AddedFlags); return addOverflowAssumption(A); } // If \p Expr represents a PHINode, we try to see if it can be represented // as an AddRec, possibly under a predicate (PHISCEVPred). If it is possible // to add this predicate as a runtime overflow check, we return the AddRec. // If \p Expr does not meet these conditions (is not a PHI node, or we // couldn't create an AddRec for it, or couldn't add the predicate), we just // return \p Expr. const SCEV *convertToAddRecWithPreds(const SCEVUnknown *Expr) { if (!isa(Expr->getValue())) return Expr; std::optional< std::pair>> PredicatedRewrite = SE.createAddRecFromPHIWithCasts(Expr); if (!PredicatedRewrite) return Expr; for (const auto *P : PredicatedRewrite->second){ // Wrap predicates from outer loops are not supported. 
if (auto *WP = dyn_cast(P)) { if (L != WP->getExpr()->getLoop()) return Expr; } if (!addOverflowAssumption(P)) return Expr; } return PredicatedRewrite->first; } SmallPtrSetImpl *NewPreds; const SCEVPredicate *Pred; const Loop *L; }; } // end anonymous namespace const SCEV * ScalarEvolution::rewriteUsingPredicate(const SCEV *S, const Loop *L, const SCEVPredicate &Preds) { return SCEVPredicateRewriter::rewrite(S, L, *this, nullptr, &Preds); } const SCEVAddRecExpr *ScalarEvolution::convertSCEVToAddRecWithPredicates( const SCEV *S, const Loop *L, SmallPtrSetImpl &Preds) { SmallPtrSet TransformPreds; S = SCEVPredicateRewriter::rewrite(S, L, *this, &TransformPreds, nullptr); auto *AddRec = dyn_cast(S); if (!AddRec) return nullptr; // Since the transformation was successful, we can now transfer the SCEV // predicates. for (const auto *P : TransformPreds) Preds.insert(P); return AddRec; } /// SCEV predicates SCEVPredicate::SCEVPredicate(const FoldingSetNodeIDRef ID, SCEVPredicateKind Kind) : FastID(ID), Kind(Kind) {} SCEVComparePredicate::SCEVComparePredicate(const FoldingSetNodeIDRef ID, const ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) : SCEVPredicate(ID, P_Compare), Pred(Pred), LHS(LHS), RHS(RHS) { assert(LHS->getType() == RHS->getType() && "LHS and RHS types don't match"); assert(LHS != RHS && "LHS and RHS are the same SCEV"); } bool SCEVComparePredicate::implies(const SCEVPredicate *N) const { const auto *Op = dyn_cast(N); if (!Op) return false; if (Pred != ICmpInst::ICMP_EQ) return false; return Op->LHS == LHS && Op->RHS == RHS; } bool SCEVComparePredicate::isAlwaysTrue() const { return false; } void SCEVComparePredicate::print(raw_ostream &OS, unsigned Depth) const { if (Pred == ICmpInst::ICMP_EQ) OS.indent(Depth) << "Equal predicate: " << *LHS << " == " << *RHS << "\n"; else OS.indent(Depth) << "Compare predicate: " << *LHS << " " << Pred << ") " << *RHS << "\n"; } SCEVWrapPredicate::SCEVWrapPredicate(const FoldingSetNodeIDRef ID, const SCEVAddRecExpr *AR, IncrementWrapFlags Flags) : SCEVPredicate(ID, P_Wrap), AR(AR), Flags(Flags) {} const SCEVAddRecExpr *SCEVWrapPredicate::getExpr() const { return AR; } bool SCEVWrapPredicate::implies(const SCEVPredicate *N) const { const auto *Op = dyn_cast(N); return Op && Op->AR == AR && setFlags(Flags, Op->Flags) == Flags; } bool SCEVWrapPredicate::isAlwaysTrue() const { SCEV::NoWrapFlags ScevFlags = AR->getNoWrapFlags(); IncrementWrapFlags IFlags = Flags; if (ScalarEvolution::setFlags(ScevFlags, SCEV::FlagNSW) == ScevFlags) IFlags = clearFlags(IFlags, IncrementNSSW); return IFlags == IncrementAnyWrap; } void SCEVWrapPredicate::print(raw_ostream &OS, unsigned Depth) const { OS.indent(Depth) << *getExpr() << " Added Flags: "; if (SCEVWrapPredicate::IncrementNUSW & getFlags()) OS << ""; if (SCEVWrapPredicate::IncrementNSSW & getFlags()) OS << ""; OS << "\n"; } SCEVWrapPredicate::IncrementWrapFlags SCEVWrapPredicate::getImpliedFlags(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { IncrementWrapFlags ImpliedFlags = IncrementAnyWrap; SCEV::NoWrapFlags StaticFlags = AR->getNoWrapFlags(); // We can safely transfer the NSW flag as NSSW. if (ScalarEvolution::setFlags(StaticFlags, SCEV::FlagNSW) == StaticFlags) ImpliedFlags = IncrementNSSW; if (ScalarEvolution::setFlags(StaticFlags, SCEV::FlagNUW) == StaticFlags) { // If the increment is positive, the SCEV NUW flag will also imply the // WrapPredicate NUSW flag. 
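// The rationale: <nuw> treats the step as an unsigned quantity, whereas the
// NUSW predicate interprets it as signed, so the transfer is only valid when
// the two interpretations agree. Hence the check below only accepts a
// constant step that is known non-negative (e.g. a step of 4 qualifies, a
// step of -1 does not).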
if (const auto *Step = dyn_cast(AR->getStepRecurrence(SE))) if (Step->getValue()->getValue().isNonNegative()) ImpliedFlags = setFlags(ImpliedFlags, IncrementNUSW); } return ImpliedFlags; } /// Union predicates don't get cached so create a dummy set ID for it. SCEVUnionPredicate::SCEVUnionPredicate(ArrayRef Preds) : SCEVPredicate(FoldingSetNodeIDRef(nullptr, 0), P_Union) { for (const auto *P : Preds) add(P); } bool SCEVUnionPredicate::isAlwaysTrue() const { return all_of(Preds, [](const SCEVPredicate *I) { return I->isAlwaysTrue(); }); } bool SCEVUnionPredicate::implies(const SCEVPredicate *N) const { if (const auto *Set = dyn_cast(N)) return all_of(Set->Preds, [this](const SCEVPredicate *I) { return this->implies(I); }); return any_of(Preds, [N](const SCEVPredicate *I) { return I->implies(N); }); } void SCEVUnionPredicate::print(raw_ostream &OS, unsigned Depth) const { for (const auto *Pred : Preds) Pred->print(OS, Depth); } void SCEVUnionPredicate::add(const SCEVPredicate *N) { if (const auto *Set = dyn_cast(N)) { for (const auto *Pred : Set->Preds) add(Pred); return; } Preds.push_back(N); } PredicatedScalarEvolution::PredicatedScalarEvolution(ScalarEvolution &SE, Loop &L) : SE(SE), L(L) { SmallVector Empty; Preds = std::make_unique(Empty); } void ScalarEvolution::registerUser(const SCEV *User, ArrayRef Ops) { for (const auto *Op : Ops) // We do not expect that forgetting cached data for SCEVConstants will ever // open any prospects for sharpening or introduce any correctness issues, // so we don't bother storing their dependencies. if (!isa(Op)) SCEVUsers[Op].insert(User); } const SCEV *PredicatedScalarEvolution::getSCEV(Value *V) { const SCEV *Expr = SE.getSCEV(V); RewriteEntry &Entry = RewriteMap[Expr]; // If we already have an entry and the version matches, return it. if (Entry.second && Generation == Entry.first) return Entry.second; // We found an entry but it's stale. Rewrite the stale entry // according to the current predicate. if (Entry.second) Expr = Entry.second; const SCEV *NewSCEV = SE.rewriteUsingPredicate(Expr, &L, *Preds); Entry = {Generation, NewSCEV}; return NewSCEV; } const SCEV *PredicatedScalarEvolution::getBackedgeTakenCount() { if (!BackedgeCount) { SmallVector Preds; BackedgeCount = SE.getPredicatedBackedgeTakenCount(&L, Preds); for (const auto *P : Preds) addPredicate(*P); } return BackedgeCount; } void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) { if (Preds->implies(&Pred)) return; auto &OldPreds = Preds->getPredicates(); SmallVector NewPreds(OldPreds.begin(), OldPreds.end()); NewPreds.push_back(&Pred); Preds = std::make_unique(NewPreds); updateGeneration(); } const SCEVPredicate &PredicatedScalarEvolution::getPredicate() const { return *Preds; } void PredicatedScalarEvolution::updateGeneration() { // If the generation number wrapped recompute everything. if (++Generation == 0) { for (auto &II : RewriteMap) { const SCEV *Rewritten = II.second.second; II.second = {Generation, SE.rewriteUsingPredicate(Rewritten, &L, *Preds)}; } } } void PredicatedScalarEvolution::setNoOverflow( Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags) { const SCEV *Expr = getSCEV(V); const auto *AR = cast(Expr); auto ImpliedFlags = SCEVWrapPredicate::getImpliedFlags(AR, SE); // Clear the statically implied flags. 
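// For example, if the AddRec for V already carries <nsw>, getImpliedFlags
// reports IncrementNSSW, so a requested NSSW bit is dropped here and the
// wrap predicate built below needs no additional runtime check for it.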
Flags = SCEVWrapPredicate::clearFlags(Flags, ImpliedFlags); addPredicate(*SE.getWrapPredicate(AR, Flags)); auto II = FlagsMap.insert({V, Flags}); if (!II.second) II.first->second = SCEVWrapPredicate::setFlags(Flags, II.first->second); } bool PredicatedScalarEvolution::hasNoOverflow( Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags) { const SCEV *Expr = getSCEV(V); const auto *AR = cast(Expr); Flags = SCEVWrapPredicate::clearFlags( Flags, SCEVWrapPredicate::getImpliedFlags(AR, SE)); auto II = FlagsMap.find(V); if (II != FlagsMap.end()) Flags = SCEVWrapPredicate::clearFlags(Flags, II->second); return Flags == SCEVWrapPredicate::IncrementAnyWrap; } const SCEVAddRecExpr *PredicatedScalarEvolution::getAsAddRec(Value *V) { const SCEV *Expr = this->getSCEV(V); SmallPtrSet NewPreds; auto *New = SE.convertSCEVToAddRecWithPredicates(Expr, &L, NewPreds); if (!New) return nullptr; for (const auto *P : NewPreds) addPredicate(*P); RewriteMap[SE.getSCEV(V)] = {Generation, New}; return New; } PredicatedScalarEvolution::PredicatedScalarEvolution( const PredicatedScalarEvolution &Init) : RewriteMap(Init.RewriteMap), SE(Init.SE), L(Init.L), Preds(std::make_unique(Init.Preds->getPredicates())), Generation(Init.Generation), BackedgeCount(Init.BackedgeCount) { for (auto I : Init.FlagsMap) FlagsMap.insert(I); } void PredicatedScalarEvolution::print(raw_ostream &OS, unsigned Depth) const { // For each block. for (auto *BB : L.getBlocks()) for (auto &I : *BB) { if (!SE.isSCEVable(I.getType())) continue; auto *Expr = SE.getSCEV(&I); auto II = RewriteMap.find(Expr); if (II == RewriteMap.end()) continue; // Don't print things that are not interesting. if (II->second.second == Expr) continue; OS.indent(Depth) << "[PSE]" << I << ":\n"; OS.indent(Depth + 2) << *Expr << "\n"; OS.indent(Depth + 2) << "--> " << *II->second.second << "\n"; } } // Match the mathematical pattern A - (A / B) * B, where A and B can be // arbitrary expressions. Also match zext (trunc A to iB) to iY, which is used // for URem with constant power-of-2 second operands. // It's not always easy, as A and B can be folded (imagine A is X / 2, and B is // 4, A / B becomes X / 8). bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS, const SCEV *&RHS) { // Try to match 'zext (trunc A to iB) to iY', which is used // for URem with constant power-of-2 second operands. Make sure the size of // the operand A matches the size of the whole expressions. if (const auto *ZExt = dyn_cast(Expr)) if (const auto *Trunc = dyn_cast(ZExt->getOperand(0))) { LHS = Trunc->getOperand(); // Bail out if the type of the LHS is larger than the type of the // expression for now. if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(Expr->getType())) return false; if (LHS->getType() != Expr->getType()) LHS = getZeroExtendExpr(LHS, Expr->getType()); RHS = getConstant(APInt(getTypeSizeInBits(Expr->getType()), 1) << getTypeSizeInBits(Trunc->getType())); return true; } const auto *Add = dyn_cast(Expr); if (Add == nullptr || Add->getNumOperands() != 2) return false; const SCEV *A = Add->getOperand(1); const auto *Mul = dyn_cast(Add->getOperand(0)); if (Mul == nullptr) return false; const auto MatchURemWithDivisor = [&](const SCEV *B) { // (SomeExpr + (-(SomeExpr / B) * B)). if (Expr == getURemExpr(A, B)) { LHS = A; RHS = B; return true; } return false; }; // (SomeExpr + (-1 * (SomeExpr / B) * B)). 
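// For example (with a non-constant divisor %b, illustrative name), A urem %b
// appears as (A + (-1 * (A /u %b) * %b)): the multiply has operands
// (-1, (A /u %b), %b), and MatchURemWithDivisor is tried with operands 1 and
// 2 so that B = %b is recognized.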
if (Mul->getNumOperands() == 3 && isa(Mul->getOperand(0))) return MatchURemWithDivisor(Mul->getOperand(1)) || MatchURemWithDivisor(Mul->getOperand(2)); // (SomeExpr + ((-SomeExpr / B) * B)) or (SomeExpr + ((SomeExpr / B) * -B)). if (Mul->getNumOperands() == 2) return MatchURemWithDivisor(Mul->getOperand(1)) || MatchURemWithDivisor(Mul->getOperand(0)) || MatchURemWithDivisor(getNegativeSCEV(Mul->getOperand(1))) || MatchURemWithDivisor(getNegativeSCEV(Mul->getOperand(0))); return false; } const SCEV * ScalarEvolution::computeSymbolicMaxBackedgeTakenCount(const Loop *L) { SmallVector ExitingBlocks; L->getExitingBlocks(ExitingBlocks); // Form an expression for the maximum exit count possible for this loop. We // merge the max and exact information to approximate a version of // getConstantMaxBackedgeTakenCount which isn't restricted to just constants. SmallVector ExitCounts; for (BasicBlock *ExitingBB : ExitingBlocks) { const SCEV *ExitCount = getExitCount(L, ExitingBB, ScalarEvolution::SymbolicMaximum); if (!isa(ExitCount)) { assert(DT.dominates(ExitingBB, L->getLoopLatch()) && "We should only have known counts for exiting blocks that " "dominate latch!"); ExitCounts.push_back(ExitCount); } } if (ExitCounts.empty()) return getCouldNotCompute(); return getUMinFromMismatchedTypes(ExitCounts, /*Sequential*/ true); } /// A rewriter to replace SCEV expressions in Map with the corresponding entry /// in the map. It skips AddRecExpr because we cannot guarantee that the /// replacement is loop invariant in the loop of the AddRec. class SCEVLoopGuardRewriter : public SCEVRewriteVisitor { const DenseMap ⤅ public: SCEVLoopGuardRewriter(ScalarEvolution &SE, DenseMap &M) : SCEVRewriteVisitor(SE), Map(M) {} const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { return Expr; } const SCEV *visitUnknown(const SCEVUnknown *Expr) { auto I = Map.find(Expr); if (I == Map.end()) return Expr; return I->second; } const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) { auto I = Map.find(Expr); if (I == Map.end()) { // If we didn't find the extact ZExt expr in the map, check if there's an // entry for a smaller ZExt we can use instead. 
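// For example, if the map holds a rewrite for (zext i8 %x to i16) but the
// visited expression is (zext i8 %x to i32), the loop below re-queries the
// map at the narrower width and returns the mapped value widened back to
// i32.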
Type *Ty = Expr->getType(); const SCEV *Op = Expr->getOperand(0); unsigned Bitwidth = Ty->getScalarSizeInBits() / 2; while (Bitwidth % 8 == 0 && Bitwidth >= 8 && Bitwidth > Op->getType()->getScalarSizeInBits()) { Type *NarrowTy = IntegerType::get(SE.getContext(), Bitwidth); auto *NarrowExt = SE.getZeroExtendExpr(Op, NarrowTy); auto I = Map.find(NarrowExt); if (I != Map.end()) return SE.getZeroExtendExpr(I->second, Ty); Bitwidth = Bitwidth / 2; } return SCEVRewriteVisitor::visitZeroExtendExpr( Expr); } return I->second; } const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) { auto I = Map.find(Expr); if (I == Map.end()) return SCEVRewriteVisitor::visitSignExtendExpr( Expr); return I->second; } const SCEV *visitUMinExpr(const SCEVUMinExpr *Expr) { auto I = Map.find(Expr); if (I == Map.end()) return SCEVRewriteVisitor::visitUMinExpr(Expr); return I->second; } const SCEV *visitSMinExpr(const SCEVSMinExpr *Expr) { auto I = Map.find(Expr); if (I == Map.end()) return SCEVRewriteVisitor::visitSMinExpr(Expr); return I->second; } }; const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) { SmallVector ExprsToRewrite; auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS, const SCEV *RHS, DenseMap &RewriteMap) { // WARNING: It is generally unsound to apply any wrap flags to the proposed // replacement SCEV which isn't directly implied by the structure of that // SCEV. In particular, using contextual facts to imply flags is *NOT* // legal. See the scoping rules for flags in the header to understand why. // If LHS is a constant, apply information to the other expression. if (isa(LHS)) { std::swap(LHS, RHS); Predicate = CmpInst::getSwappedPredicate(Predicate); } // Check for a condition of the form (-C1 + X < C2). InstCombine will // create this form when combining two checks of the form (X u< C2 + C1) and // (X >=u C1). auto MatchRangeCheckIdiom = [this, Predicate, LHS, RHS, &RewriteMap, &ExprsToRewrite]() { auto *AddExpr = dyn_cast(LHS); if (!AddExpr || AddExpr->getNumOperands() != 2) return false; auto *C1 = dyn_cast(AddExpr->getOperand(0)); auto *LHSUnknown = dyn_cast(AddExpr->getOperand(1)); auto *C2 = dyn_cast(RHS); if (!C1 || !C2 || !LHSUnknown) return false; auto ExactRegion = ConstantRange::makeExactICmpRegion(Predicate, C2->getAPInt()) .sub(C1->getAPInt()); // Bail out, unless we have a non-wrapping, monotonic range. if (ExactRegion.isWrappedSet() || ExactRegion.isFullSet()) return false; auto I = RewriteMap.find(LHSUnknown); const SCEV *RewrittenLHS = I != RewriteMap.end() ? I->second : LHSUnknown; RewriteMap[LHSUnknown] = getUMaxExpr( getConstant(ExactRegion.getUnsignedMin()), getUMinExpr(RewrittenLHS, getConstant(ExactRegion.getUnsignedMax()))); ExprsToRewrite.push_back(LHSUnknown); return true; }; if (MatchRangeCheckIdiom()) return; // Return true if \p Expr is a MinMax SCEV expression with a non-negative // constant operand. If so, return in \p SCTy the SCEV type and in \p RHS // the non-constant operand and in \p LHS the constant operand. 
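// For example, (8 umax %len) matches with SCTy = scUMaxExpr, LHS = 8 and
// RHS = %len, while (-4 smax %len) is rejected because the constant operand
// is negative.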
auto IsMinMaxSCEVWithNonNegativeConstant = [&](const SCEV *Expr, SCEVTypes &SCTy, const SCEV *&LHS, const SCEV *&RHS) { if (auto *MinMax = dyn_cast(Expr)) { if (MinMax->getNumOperands() != 2) return false; if (auto *C = dyn_cast(MinMax->getOperand(0))) { if (C->getAPInt().isNegative()) return false; SCTy = MinMax->getSCEVType(); LHS = MinMax->getOperand(0); RHS = MinMax->getOperand(1); return true; } } return false; }; // Checks whether Expr is a non-negative constant, and Divisor is a positive // constant, and returns their APInt in ExprVal and in DivisorVal. auto GetNonNegExprAndPosDivisor = [&](const SCEV *Expr, const SCEV *Divisor, APInt &ExprVal, APInt &DivisorVal) { auto *ConstExpr = dyn_cast(Expr); auto *ConstDivisor = dyn_cast(Divisor); if (!ConstExpr || !ConstDivisor) return false; ExprVal = ConstExpr->getAPInt(); DivisorVal = ConstDivisor->getAPInt(); return ExprVal.isNonNegative() && !DivisorVal.isNonPositive(); }; // Return a new SCEV that modifies \p Expr to the closest number divides by // \p Divisor and greater or equal than Expr. // For now, only handle constant Expr and Divisor. auto GetNextSCEVDividesByDivisor = [&](const SCEV *Expr, const SCEV *Divisor) { APInt ExprVal; APInt DivisorVal; if (!GetNonNegExprAndPosDivisor(Expr, Divisor, ExprVal, DivisorVal)) return Expr; APInt Rem = ExprVal.urem(DivisorVal); if (!Rem.isZero()) // return the SCEV: Expr + Divisor - Expr % Divisor return getConstant(ExprVal + DivisorVal - Rem); return Expr; }; // Return a new SCEV that modifies \p Expr to the closest number divides by // \p Divisor and less or equal than Expr. // For now, only handle constant Expr and Divisor. auto GetPreviousSCEVDividesByDivisor = [&](const SCEV *Expr, const SCEV *Divisor) { APInt ExprVal; APInt DivisorVal; if (!GetNonNegExprAndPosDivisor(Expr, Divisor, ExprVal, DivisorVal)) return Expr; APInt Rem = ExprVal.urem(DivisorVal); // return the SCEV: Expr - Expr % Divisor return getConstant(ExprVal - Rem); }; // Apply divisibilty by \p Divisor on MinMaxExpr with constant values, // recursively. This is done by aligning up/down the constant value to the // Divisor. std::function ApplyDivisibiltyOnMinMaxExpr = [&](const SCEV *MinMaxExpr, const SCEV *Divisor) { const SCEV *MinMaxLHS = nullptr, *MinMaxRHS = nullptr; SCEVTypes SCTy; if (!IsMinMaxSCEVWithNonNegativeConstant(MinMaxExpr, SCTy, MinMaxLHS, MinMaxRHS)) return MinMaxExpr; auto IsMin = isa(MinMaxExpr) || isa(MinMaxExpr); assert(isKnownNonNegative(MinMaxLHS) && "Expected non-negative operand!"); auto *DivisibleExpr = IsMin ? GetPreviousSCEVDividesByDivisor(MinMaxLHS, Divisor) : GetNextSCEVDividesByDivisor(MinMaxLHS, Divisor); SmallVector Ops = { ApplyDivisibiltyOnMinMaxExpr(MinMaxRHS, Divisor), DivisibleExpr}; return getMinMaxExpr(SCTy, Ops); }; // If we have LHS == 0, check if LHS is computing a property of some unknown // SCEV %v which we can rewrite %v to express explicitly. const SCEVConstant *RHSC = dyn_cast(RHS); if (Predicate == CmpInst::ICMP_EQ && RHSC && RHSC->getValue()->isNullValue()) { // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to // explicitly express that. const SCEV *URemLHS = nullptr; const SCEV *URemRHS = nullptr; if (matchURem(LHS, URemLHS, URemRHS)) { if (const SCEVUnknown *LHSUnknown = dyn_cast(URemLHS)) { auto I = RewriteMap.find(LHSUnknown); const SCEV *RewrittenLHS = I != RewriteMap.end() ? 
I->second : LHSUnknown; RewrittenLHS = ApplyDivisibiltyOnMinMaxExpr(RewrittenLHS, URemRHS); const auto *Multiple = getMulExpr(getUDivExpr(RewrittenLHS, URemRHS), URemRHS); RewriteMap[LHSUnknown] = Multiple; ExprsToRewrite.push_back(LHSUnknown); return; } } } // Do not apply information for constants or if RHS contains an AddRec. if (isa(LHS) || containsAddRecurrence(RHS)) return; // If RHS is SCEVUnknown, make sure the information is applied to it. if (!isa(LHS) && isa(RHS)) { std::swap(LHS, RHS); Predicate = CmpInst::getSwappedPredicate(Predicate); } // Puts rewrite rule \p From -> \p To into the rewrite map. Also if \p From // and \p FromRewritten are the same (i.e. there has been no rewrite // registered for \p From), then puts this value in the list of rewritten // expressions. auto AddRewrite = [&](const SCEV *From, const SCEV *FromRewritten, const SCEV *To) { if (From == FromRewritten) ExprsToRewrite.push_back(From); RewriteMap[From] = To; }; // Checks whether \p S has already been rewritten. In that case returns the // existing rewrite because we want to chain further rewrites onto the // already rewritten value. Otherwise returns \p S. auto GetMaybeRewritten = [&](const SCEV *S) { auto I = RewriteMap.find(S); return I != RewriteMap.end() ? I->second : S; }; // Check for the SCEV expression (A /u B) * B while B is a constant, inside // \p Expr. The check is done recuresively on \p Expr, which is assumed to // be a composition of Min/Max SCEVs. Return whether the SCEV expression (A // /u B) * B was found, and return the divisor B in \p DividesBy. For // example, if Expr = umin (umax ((A /u 8) * 8, 16), 64), return true since // (A /u 8) * 8 matched the pattern, and return the constant SCEV 8 in \p // DividesBy. std::function HasDivisibiltyInfo = [&](const SCEV *Expr, const SCEV *&DividesBy) { if (auto *Mul = dyn_cast(Expr)) { if (Mul->getNumOperands() != 2) return false; auto *MulLHS = Mul->getOperand(0); auto *MulRHS = Mul->getOperand(1); if (isa(MulLHS)) std::swap(MulLHS, MulRHS); if (auto *Div = dyn_cast(MulLHS)) if (Div->getOperand(1) == MulRHS) { DividesBy = MulRHS; return true; } } if (auto *MinMax = dyn_cast(Expr)) return HasDivisibiltyInfo(MinMax->getOperand(0), DividesBy) || HasDivisibiltyInfo(MinMax->getOperand(1), DividesBy); return false; }; // Return true if Expr known to divide by \p DividesBy. std::function IsKnownToDivideBy = [&](const SCEV *Expr, const SCEV *DividesBy) { if (getURemExpr(Expr, DividesBy)->isZero()) return true; if (auto *MinMax = dyn_cast(Expr)) return IsKnownToDivideBy(MinMax->getOperand(0), DividesBy) && IsKnownToDivideBy(MinMax->getOperand(1), DividesBy); return false; }; const SCEV *RewrittenLHS = GetMaybeRewritten(LHS); const SCEV *DividesBy = nullptr; if (HasDivisibiltyInfo(RewrittenLHS, DividesBy)) // Check that the whole expression is divided by DividesBy DividesBy = IsKnownToDivideBy(RewrittenLHS, DividesBy) ? DividesBy : nullptr; // Collect rewrites for LHS and its transitive operands based on the // condition. // For min/max expressions, also apply the guard to its operands: // 'min(a, b) >= c' -> '(a >= c) and (b >= c)', // 'min(a, b) > c' -> '(a > c) and (b > c)', // 'max(a, b) <= c' -> '(a <= c) and (b <= c)', // 'max(a, b) < c' -> '(a < c) and (b < c)'. // We cannot express strict predicates in SCEV, so instead we replace them // with non-strict ones against plus or minus one of RHS depending on the // predicate. 
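// For example, a guard 'x u< 16' is applied as 'x u<= 15', rewriting x to
// (x umin 15); a guard 'x s> 3' is applied as 'x s>= 4', rewriting x to
// (x smax 4).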
const SCEV *One = getOne(RHS->getType()); switch (Predicate) { case CmpInst::ICMP_ULT: if (RHS->getType()->isPointerTy()) return; RHS = getUMaxExpr(RHS, One); [[fallthrough]]; case CmpInst::ICMP_SLT: { RHS = getMinusSCEV(RHS, One); RHS = DividesBy ? GetPreviousSCEVDividesByDivisor(RHS, DividesBy) : RHS; break; } case CmpInst::ICMP_UGT: case CmpInst::ICMP_SGT: RHS = getAddExpr(RHS, One); RHS = DividesBy ? GetNextSCEVDividesByDivisor(RHS, DividesBy) : RHS; break; case CmpInst::ICMP_ULE: case CmpInst::ICMP_SLE: RHS = DividesBy ? GetPreviousSCEVDividesByDivisor(RHS, DividesBy) : RHS; break; case CmpInst::ICMP_UGE: case CmpInst::ICMP_SGE: RHS = DividesBy ? GetNextSCEVDividesByDivisor(RHS, DividesBy) : RHS; break; default: break; } SmallVector Worklist(1, LHS); SmallPtrSet Visited; auto EnqueueOperands = [&Worklist](const SCEVNAryExpr *S) { append_range(Worklist, S->operands()); }; while (!Worklist.empty()) { const SCEV *From = Worklist.pop_back_val(); if (isa(From)) continue; if (!Visited.insert(From).second) continue; const SCEV *FromRewritten = GetMaybeRewritten(From); const SCEV *To = nullptr; switch (Predicate) { case CmpInst::ICMP_ULT: case CmpInst::ICMP_ULE: To = getUMinExpr(FromRewritten, RHS); if (auto *UMax = dyn_cast(FromRewritten)) EnqueueOperands(UMax); break; case CmpInst::ICMP_SLT: case CmpInst::ICMP_SLE: To = getSMinExpr(FromRewritten, RHS); if (auto *SMax = dyn_cast(FromRewritten)) EnqueueOperands(SMax); break; case CmpInst::ICMP_UGT: case CmpInst::ICMP_UGE: To = getUMaxExpr(FromRewritten, RHS); if (auto *UMin = dyn_cast(FromRewritten)) EnqueueOperands(UMin); break; case CmpInst::ICMP_SGT: case CmpInst::ICMP_SGE: To = getSMaxExpr(FromRewritten, RHS); if (auto *SMin = dyn_cast(FromRewritten)) EnqueueOperands(SMin); break; case CmpInst::ICMP_EQ: if (isa(RHS)) To = RHS; break; case CmpInst::ICMP_NE: if (isa(RHS) && cast(RHS)->getValue()->isNullValue()) { const SCEV *OneAlignedUp = DividesBy ? GetNextSCEVDividesByDivisor(One, DividesBy) : One; To = getUMaxExpr(FromRewritten, OneAlignedUp); } break; default: break; } if (To) AddRewrite(From, FromRewritten, To); } }; BasicBlock *Header = L->getHeader(); SmallVector> Terms; // First, collect information from assumptions dominating the loop. for (auto &AssumeVH : AC.assumptions()) { if (!AssumeVH) continue; auto *AssumeI = cast(AssumeVH); if (!DT.dominates(AssumeI, Header)) continue; Terms.emplace_back(AssumeI->getOperand(0), true); } // Second, collect information from llvm.experimental.guards dominating the loop. auto *GuardDecl = F.getParent()->getFunction( Intrinsic::getName(Intrinsic::experimental_guard)); if (GuardDecl) for (const auto *GU : GuardDecl->users()) if (const auto *Guard = dyn_cast(GU)) if (Guard->getFunction() == Header->getParent() && DT.dominates(Guard, Header)) Terms.emplace_back(Guard->getArgOperand(0), true); // Third, collect conditions from dominating branches. Starting at the loop // predecessor, climb up the predecessor chain, as long as there are // predecessors that can be found that have unique successors leading to the // original header. // TODO: share this logic with isLoopEntryGuardedByCond. 
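// For example (illustrative IR), with
//   entry: br i1 (%n u> 0), label %ph, label %exit
//   ph:    br label %loop
// the walk starts at %ph, the loop predecessor, skips its unconditional
// branch, climbs to %entry, and records the condition (%n u> 0) together
// with the fact that the loop is entered on the true edge.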
for (std::pair Pair( L->getLoopPredecessor(), Header); Pair.first; Pair = getPredecessorWithUniqueSuccessorForBB(Pair.first)) { const BranchInst *LoopEntryPredicate = dyn_cast(Pair.first->getTerminator()); if (!LoopEntryPredicate || LoopEntryPredicate->isUnconditional()) continue; Terms.emplace_back(LoopEntryPredicate->getCondition(), LoopEntryPredicate->getSuccessor(0) == Pair.second); } // Now apply the information from the collected conditions to RewriteMap. // Conditions are processed in reverse order, so the earliest conditions is // processed first. This ensures the SCEVs with the shortest dependency chains // are constructed first. DenseMap RewriteMap; for (auto [Term, EnterIfTrue] : reverse(Terms)) { SmallVector Worklist; SmallPtrSet Visited; Worklist.push_back(Term); while (!Worklist.empty()) { Value *Cond = Worklist.pop_back_val(); if (!Visited.insert(Cond).second) continue; if (auto *Cmp = dyn_cast(Cond)) { auto Predicate = EnterIfTrue ? Cmp->getPredicate() : Cmp->getInversePredicate(); const auto *LHS = getSCEV(Cmp->getOperand(0)); const auto *RHS = getSCEV(Cmp->getOperand(1)); CollectCondition(Predicate, LHS, RHS, RewriteMap); continue; } Value *L, *R; if (EnterIfTrue ? match(Cond, m_LogicalAnd(m_Value(L), m_Value(R))) : match(Cond, m_LogicalOr(m_Value(L), m_Value(R)))) { Worklist.push_back(L); Worklist.push_back(R); } } } if (RewriteMap.empty()) return Expr; // Now that all rewrite information is collect, rewrite the collected // expressions with the information in the map. This applies information to // sub-expressions. if (ExprsToRewrite.size() > 1) { for (const SCEV *Expr : ExprsToRewrite) { const SCEV *RewriteTo = RewriteMap[Expr]; RewriteMap.erase(Expr); SCEVLoopGuardRewriter Rewriter(*this, RewriteMap); RewriteMap.insert({Expr, Rewriter.visit(RewriteTo)}); } } SCEVLoopGuardRewriter Rewriter(*this, RewriteMap); return Rewriter.visit(Expr); } diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 952f454f8f6a..7979ac9a5fb7 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -1,2067 +1,2077 @@ //===- ComplexDeinterleavingPass.cpp --------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // Identification: // This step is responsible for finding the patterns that can be lowered to // complex instructions, and building a graph to represent the complex // structures. Starting from the "Converging Shuffle" (a shuffle that // reinterleaves the complex components, with a mask of <0, 2, 1, 3>), the // operands are evaluated and identified as "Composite Nodes" (collections of // instructions that can potentially be lowered to a single complex // instruction). This is performed by checking the real and imaginary components // and tracking the data flow for each component while following the operand // pairs. Validity of each node is expected to be done upon creation, and any // validation errors should halt traversal and prevent further graph // construction. // Instead of relying on Shuffle operations, vector interleaving and // deinterleaving can be represented by vector.interleave2 and // vector.deinterleave2 intrinsics. 
Scalable vectors can be represented only by // these intrinsics, whereas, fixed-width vectors are recognized for both // shufflevector instruction and intrinsics. // // Replacement: // This step traverses the graph built up by identification, delegating to the // target to validate and generate the correct intrinsics, and plumbs them // together connecting each end of the new intrinsics graph to the existing // use-def chain. This step is assumed to finish successfully, as all // information is expected to be correct by this point. // // // Internal data structure: // ComplexDeinterleavingGraph: // Keeps references to all the valid CompositeNodes formed as part of the // transformation, and every Instruction contained within said nodes. It also // holds onto a reference to the root Instruction, and the root node that should // replace it. // // ComplexDeinterleavingCompositeNode: // A CompositeNode represents a single transformation point; each node should // transform into a single complex instruction (ignoring vector splitting, which // would generate more instructions per node). They are identified in a // depth-first manner, traversing and identifying the operands of each // instruction in the order they appear in the IR. // Each node maintains a reference to its Real and Imaginary instructions, // as well as any additional instructions that make up the identified operation // (Internal instructions should only have uses within their containing node). // A Node also contains the rotation and operation type that it represents. // Operands contains pointers to other CompositeNodes, acting as the edges in // the graph. ReplacementValue is the transformed Value* that has been emitted // to the IR. // // Note: If the operation of a Node is Shuffle, only the Real, Imaginary, and // ReplacementValue fields of that Node are relevant, where the ReplacementValue // should be pre-populated. // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/ComplexDeinterleavingPass.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/Local.h" #include using namespace llvm; using namespace PatternMatch; #define DEBUG_TYPE "complex-deinterleaving" STATISTIC(NumComplexTransformations, "Amount of complex patterns transformed"); static cl::opt ComplexDeinterleavingEnabled( "enable-complex-deinterleaving", cl::desc("Enable generation of complex instructions"), cl::init(true), cl::Hidden); /// Checks the given mask, and determines whether said mask is interleaving. /// /// To be interleaving, a mask must alternate between `i` and `i + (Length / /// 2)`, and must contain all numbers within the range of `[0..Length)` (e.g. a /// 4x vector interleaving mask would be <0, 2, 1, 3>). static bool isInterleavingMask(ArrayRef Mask); /// Checks the given mask, and determines whether said mask is deinterleaving. /// /// To be deinterleaving, a mask must increment in steps of 2, and either start /// with 0 or 1. /// (e.g. an 8x vector deinterleaving mask would be either <0, 2, 4, 6> or /// <1, 3, 5, 7>). 
static bool isDeinterleavingMask(ArrayRef Mask); /// Returns true if the operation is a negation of V, and it works for both /// integers and floats. static bool isNeg(Value *V); /// Returns the operand for negation operation. static Value *getNegOperand(Value *V); namespace { class ComplexDeinterleavingLegacyPass : public FunctionPass { public: static char ID; ComplexDeinterleavingLegacyPass(const TargetMachine *TM = nullptr) : FunctionPass(ID), TM(TM) { initializeComplexDeinterleavingLegacyPassPass( *PassRegistry::getPassRegistry()); } StringRef getPassName() const override { return "Complex Deinterleaving Pass"; } bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.setPreservesCFG(); } private: const TargetMachine *TM; }; class ComplexDeinterleavingGraph; struct ComplexDeinterleavingCompositeNode { ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op, Value *R, Value *I) : Operation(Op), Real(R), Imag(I) {} private: friend class ComplexDeinterleavingGraph; using NodePtr = std::shared_ptr; using RawNodePtr = ComplexDeinterleavingCompositeNode *; public: ComplexDeinterleavingOperation Operation; Value *Real; Value *Imag; // This two members are required exclusively for generating // ComplexDeinterleavingOperation::Symmetric operations. unsigned Opcode; std::optional Flags; ComplexDeinterleavingRotation Rotation = ComplexDeinterleavingRotation::Rotation_0; SmallVector Operands; Value *ReplacementNode = nullptr; void addOperand(NodePtr Node) { Operands.push_back(Node.get()); } void dump() { dump(dbgs()); } void dump(raw_ostream &OS) { auto PrintValue = [&](Value *V) { if (V) { OS << "\""; V->print(OS, true); OS << "\"\n"; } else OS << "nullptr\n"; }; auto PrintNodeRef = [&](RawNodePtr Ptr) { if (Ptr) OS << Ptr << "\n"; else OS << "nullptr\n"; }; OS << "- CompositeNode: " << this << "\n"; OS << " Real: "; PrintValue(Real); OS << " Imag: "; PrintValue(Imag); OS << " ReplacementNode: "; PrintValue(ReplacementNode); OS << " Operation: " << (int)Operation << "\n"; OS << " Rotation: " << ((int)Rotation * 90) << "\n"; OS << " Operands: \n"; for (const auto &Op : Operands) { OS << " - "; PrintNodeRef(Op); } } }; class ComplexDeinterleavingGraph { public: struct Product { Value *Multiplier; Value *Multiplicand; bool IsPositive; }; using Addend = std::pair; using NodePtr = ComplexDeinterleavingCompositeNode::NodePtr; using RawNodePtr = ComplexDeinterleavingCompositeNode::RawNodePtr; // Helper struct for holding info about potential partial multiplication // candidates struct PartialMulCandidate { Value *Common; NodePtr Node; unsigned RealIdx; unsigned ImagIdx; bool IsNodeInverted; }; explicit ComplexDeinterleavingGraph(const TargetLowering *TL, const TargetLibraryInfo *TLI) : TL(TL), TLI(TLI) {} private: const TargetLowering *TL = nullptr; const TargetLibraryInfo *TLI = nullptr; SmallVector CompositeNodes; DenseMap, NodePtr> CachedResult; SmallPtrSet FinalInstructions; /// Root instructions are instructions from which complex computation starts std::map RootToNode; /// Topologically sorted root instructions SmallVector OrderedRoots; /// When examining a basic block for complex deinterleaving, if it is a simple /// one-block loop, then the only incoming block is 'Incoming' and the /// 'BackEdge' block is the block itself." 
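/// For example (illustrative IR):
///   entry:
///     br label %vector.body
///   vector.body:            ; Incoming = %entry, BackEdge = %vector.body
///     ...
///     br i1 %cond, label %vector.body, label %exit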
BasicBlock *BackEdge = nullptr; BasicBlock *Incoming = nullptr; /// ReductionInfo maps from %ReductionOp to %PHInode and Instruction /// %OutsideUser as it is shown in the IR: /// /// vector.body: /// %PHInode = phi [ zeroinitializer, %entry ], /// [ %ReductionOp, %vector.body ] /// ... /// %ReductionOp = fadd i64 ... /// ... /// br i1 %condition, label %vector.body, %middle.block /// /// middle.block: /// %OutsideUser = llvm.vector.reduce.fadd(..., %ReductionOp) /// /// %OutsideUser can be `llvm.vector.reduce.fadd` or `fadd` preceding /// `llvm.vector.reduce.fadd` when unroll factor isn't one. std::map> ReductionInfo; /// In the process of detecting a reduction, we consider a pair of /// %ReductionOP, which we refer to as real and imag (or vice versa), and /// traverse the use-tree to detect complex operations. As this is a reduction /// operation, it will eventually reach RealPHI and ImagPHI, which corresponds /// to the %ReductionOPs that we suspect to be complex. /// RealPHI and ImagPHI are used by the identifyPHINode method. PHINode *RealPHI = nullptr; PHINode *ImagPHI = nullptr; /// Set this flag to true if RealPHI and ImagPHI were reached during reduction /// detection. bool PHIsFound = false; /// OldToNewPHI maps the original real PHINode to a new, double-sized PHINode. /// The new PHINode corresponds to a vector of deinterleaved complex numbers. /// This mapping is populated during /// ComplexDeinterleavingOperation::ReductionPHI node replacement. It is then /// used in the ComplexDeinterleavingOperation::ReductionOperation node /// replacement process. std::map OldToNewPHI; NodePtr prepareCompositeNode(ComplexDeinterleavingOperation Operation, Value *R, Value *I) { assert(((Operation != ComplexDeinterleavingOperation::ReductionPHI && Operation != ComplexDeinterleavingOperation::ReductionOperation) || (R && I)) && "Reduction related nodes must have Real and Imaginary parts"); return std::make_shared(Operation, R, I); } NodePtr submitCompositeNode(NodePtr Node) { CompositeNodes.push_back(Node); if (Node->Real && Node->Imag) CachedResult[{Node->Real, Node->Imag}] = Node; return Node; } /// Identifies a complex partial multiply pattern and its rotation, based on /// the following patterns /// /// 0: r: cr + ar * br /// i: ci + ar * bi /// 90: r: cr - ai * bi /// i: ci + ai * br /// 180: r: cr - ar * br /// i: ci - ar * bi /// 270: r: cr + ai * bi /// i: ci - ai * br NodePtr identifyPartialMul(Instruction *Real, Instruction *Imag); /// Identify the other branch of a Partial Mul, taking the CommonOperandI that /// is partially known from identifyPartialMul, filling in the other half of /// the complex pair. NodePtr identifyNodeWithImplicitAdd(Instruction *I, Instruction *J, std::pair &CommonOperandI); /// Identifies a complex add pattern and its rotation, based on the following /// patterns. /// /// 90: r: ar - bi /// i: ai + br /// 270: r: ar + bi /// i: ai - br NodePtr identifyAdd(Instruction *Real, Instruction *Imag); NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag); NodePtr identifyNode(Value *R, Value *I); /// Determine if a sum of complex numbers can be formed from \p RealAddends /// and \p ImagAddens. If \p Accumulator is not null, add the result to it. /// Return nullptr if it is not possible to construct a complex number. /// \p Flags are needed to generate symmetric Add and Sub operations. 
  NodePtr identifyAdditions(std::list<Addend> &RealAddends,
                            std::list<Addend> &ImagAddends,
                            std::optional<FastMathFlags> Flags,
                            NodePtr Accumulator);

  /// Extract one addend that has both real and imaginary parts positive.
  NodePtr extractPositiveAddend(std::list<Addend> &RealAddends,
                                std::list<Addend> &ImagAddends);

  /// Determine if a sum of multiplications of complex numbers can be formed
  /// from \p RealMuls and \p ImagMuls. If \p Accumulator is not null, add the
  /// result to it. Return nullptr if it is not possible to construct a
  /// complex number.
  NodePtr identifyMultiplications(std::vector<Product> &RealMuls,
                                  std::vector<Product> &ImagMuls,
                                  NodePtr Accumulator);

  /// Go through pairs of multiplications (one Real and one Imag) and find all
  /// possible candidates for partial multiplication and put them into \p
  /// Candidates. Returns true if every Product has a pair with a common
  /// operand.
  bool collectPartialMuls(const std::vector<Product> &RealMuls,
                          const std::vector<Product> &ImagMuls,
                          std::vector<PartialMulCandidate> &Candidates);

  /// If the code is compiled with -Ofast or expressions have the `reassoc`
  /// flag, the order of complex computation operations may be significantly
  /// altered, and the real and imaginary parts may not be executed in
  /// parallel. This function takes this into consideration and employs a
  /// more general approach to identify complex computations. Initially, it
  /// gathers all the addends and multiplicands and then constructs a complex
  /// expression from them.
  NodePtr identifyReassocNodes(Instruction *I, Instruction *J);

  NodePtr identifyRoot(Instruction *I);

  /// Identifies the Deinterleave operation applied to a vector containing
  /// complex numbers. There are two ways to represent the Deinterleave
  /// operation:
  /// * Using two shufflevectors with even indices for the \p Real instruction
  ///   and odd indices for the \p Imag instruction (only for fixed-width
  ///   vectors)
  /// * Using two extractvalue instructions applied to the
  ///   `vector.deinterleave2` intrinsic (for both fixed and scalable vectors)
  NodePtr identifyDeinterleave(Instruction *Real, Instruction *Imag);

  /// Identifies the operation that represents a complex number repeated in a
  /// Splat vector. There are two possible types of splats: ConstantExpr with
  /// the opcode ShuffleVector and ShuffleVectorInstr. Both should have an
  /// initialization mask with all values set to zero.
  NodePtr identifySplat(Value *Real, Value *Imag);

  NodePtr identifyPHINode(Instruction *Real, Instruction *Imag);

  /// Identifies SelectInsts in a loop that has a reduction with predication
  /// masks and/or predicated tail folding.
  NodePtr identifySelectNode(Instruction *Real, Instruction *Imag);

  Value *replaceNode(IRBuilderBase &Builder, RawNodePtr Node);

  /// Complete IR modifications after producing a new reduction operation:
  /// * Populate the PHINode generated for
  ///   ComplexDeinterleavingOperation::ReductionPHI.
  /// * Deinterleave the final value outside of the loop and repurpose the
  ///   original reduction users.
  void processReductionOperation(Value *OperationReplacement, RawNodePtr Node);

public:
  void dump() { dump(dbgs()); }
  void dump(raw_ostream &OS) {
    for (const auto &Node : CompositeNodes)
      Node->dump(OS);
  }

  /// Returns false if the deinterleaving operation should be cancelled for
  /// the current graph.
  bool identifyNodes(Instruction *RootI);

  /// In case \p B is a one-block loop, this function seeks potential
  /// reductions and populates ReductionInfo. Returns true if any reductions
  /// were identified.
  bool collectPotentialReductions(BasicBlock *B);

  void identifyReductionNodes();

  /// Check that every instruction, from the roots to the leaves, has internal
  /// uses.
bool checkNodes(); /// Perform the actual replacement of the underlying instruction graph. void replaceNodes(); }; class ComplexDeinterleaving { public: ComplexDeinterleaving(const TargetLowering *tl, const TargetLibraryInfo *tli) : TL(tl), TLI(tli) {} bool runOnFunction(Function &F); private: bool evaluateBasicBlock(BasicBlock *B); const TargetLowering *TL = nullptr; const TargetLibraryInfo *TLI = nullptr; }; } // namespace char ComplexDeinterleavingLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(ComplexDeinterleavingLegacyPass, DEBUG_TYPE, "Complex Deinterleaving", false, false) INITIALIZE_PASS_END(ComplexDeinterleavingLegacyPass, DEBUG_TYPE, "Complex Deinterleaving", false, false) PreservedAnalyses ComplexDeinterleavingPass::run(Function &F, FunctionAnalysisManager &AM) { const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering(); auto &TLI = AM.getResult(F); if (!ComplexDeinterleaving(TL, &TLI).runOnFunction(F)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserve(); return PA; } FunctionPass *llvm::createComplexDeinterleavingPass(const TargetMachine *TM) { return new ComplexDeinterleavingLegacyPass(TM); } bool ComplexDeinterleavingLegacyPass::runOnFunction(Function &F) { const auto *TL = TM->getSubtargetImpl(F)->getTargetLowering(); auto TLI = getAnalysis().getTLI(F); return ComplexDeinterleaving(TL, &TLI).runOnFunction(F); } bool ComplexDeinterleaving::runOnFunction(Function &F) { if (!ComplexDeinterleavingEnabled) { LLVM_DEBUG( dbgs() << "Complex deinterleaving has been explicitly disabled.\n"); return false; } if (!TL->isComplexDeinterleavingSupported()) { LLVM_DEBUG( dbgs() << "Complex deinterleaving has been disabled, target does " "not support lowering of complex number operations.\n"); return false; } bool Changed = false; for (auto &B : F) Changed |= evaluateBasicBlock(&B); return Changed; } static bool isInterleavingMask(ArrayRef Mask) { // If the size is not even, it's not an interleaving mask if ((Mask.size() & 1)) return false; int HalfNumElements = Mask.size() / 2; for (int Idx = 0; Idx < HalfNumElements; ++Idx) { int MaskIdx = Idx * 2; if (Mask[MaskIdx] != Idx || Mask[MaskIdx + 1] != (Idx + HalfNumElements)) return false; } return true; } static bool isDeinterleavingMask(ArrayRef Mask) { int Offset = Mask[0]; int HalfNumElements = Mask.size() / 2; for (int Idx = 1; Idx < HalfNumElements; ++Idx) { if (Mask[Idx] != (Idx * 2) + Offset) return false; } return true; } bool isNeg(Value *V) { return match(V, m_FNeg(m_Value())) || match(V, m_Neg(m_Value())); } Value *getNegOperand(Value *V) { assert(isNeg(V)); auto *I = cast(V); if (I->getOpcode() == Instruction::FNeg) return I->getOperand(0); return I->getOperand(1); } bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) { ComplexDeinterleavingGraph Graph(TL, TLI); if (Graph.collectPotentialReductions(B)) Graph.identifyReductionNodes(); for (auto &I : *B) Graph.identifyNodes(&I); if (Graph.checkNodes()) { Graph.replaceNodes(); return true; } return false; } ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyNodeWithImplicitAdd( Instruction *Real, Instruction *Imag, std::pair &PartialMatch) { LLVM_DEBUG(dbgs() << "identifyNodeWithImplicitAdd " << *Real << " / " << *Imag << "\n"); if (!Real->hasOneUse() || !Imag->hasOneUse()) { LLVM_DEBUG(dbgs() << " - Mul operand has multiple uses.\n"); return nullptr; } if ((Real->getOpcode() != Instruction::FMul && Real->getOpcode() != Instruction::Mul) || (Imag->getOpcode() != Instruction::FMul && Imag->getOpcode() != Instruction::Mul)) { 
LLVM_DEBUG( dbgs() << " - Real or imaginary instruction is not fmul or mul\n"); return nullptr; } Value *R0 = Real->getOperand(0); Value *R1 = Real->getOperand(1); Value *I0 = Imag->getOperand(0); Value *I1 = Imag->getOperand(1); // A +/+ has a rotation of 0. If any of the operands are fneg, we flip the // rotations and use the operand. unsigned Negs = 0; Value *Op; if (match(R0, m_Neg(m_Value(Op)))) { Negs |= 1; R0 = Op; } else if (match(R1, m_Neg(m_Value(Op)))) { Negs |= 1; R1 = Op; } if (isNeg(I0)) { Negs |= 2; Negs ^= 1; I0 = Op; } else if (match(I1, m_Neg(m_Value(Op)))) { Negs |= 2; Negs ^= 1; I1 = Op; } ComplexDeinterleavingRotation Rotation = (ComplexDeinterleavingRotation)Negs; Value *CommonOperand; Value *UncommonRealOp; Value *UncommonImagOp; if (R0 == I0 || R0 == I1) { CommonOperand = R0; UncommonRealOp = R1; } else if (R1 == I0 || R1 == I1) { CommonOperand = R1; UncommonRealOp = R0; } else { LLVM_DEBUG(dbgs() << " - No equal operand\n"); return nullptr; } UncommonImagOp = (CommonOperand == I0) ? I1 : I0; if (Rotation == ComplexDeinterleavingRotation::Rotation_90 || Rotation == ComplexDeinterleavingRotation::Rotation_270) std::swap(UncommonRealOp, UncommonImagOp); // Between identifyPartialMul and here we need to have found a complete valid // pair from the CommonOperand of each part. if (Rotation == ComplexDeinterleavingRotation::Rotation_0 || Rotation == ComplexDeinterleavingRotation::Rotation_180) PartialMatch.first = CommonOperand; else PartialMatch.second = CommonOperand; if (!PartialMatch.first || !PartialMatch.second) { LLVM_DEBUG(dbgs() << " - Incomplete partial match\n"); return nullptr; } NodePtr CommonNode = identifyNode(PartialMatch.first, PartialMatch.second); if (!CommonNode) { LLVM_DEBUG(dbgs() << " - No CommonNode identified\n"); return nullptr; } NodePtr UncommonNode = identifyNode(UncommonRealOp, UncommonImagOp); if (!UncommonNode) { LLVM_DEBUG(dbgs() << " - No UncommonNode identified\n"); return nullptr; } NodePtr Node = prepareCompositeNode( ComplexDeinterleavingOperation::CMulPartial, Real, Imag); Node->Rotation = Rotation; Node->addOperand(CommonNode); Node->addOperand(UncommonNode); return submitCompositeNode(Node); } ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyPartialMul(Instruction *Real, Instruction *Imag) { LLVM_DEBUG(dbgs() << "identifyPartialMul " << *Real << " / " << *Imag << "\n"); // Determine rotation auto IsAdd = [](unsigned Op) { return Op == Instruction::FAdd || Op == Instruction::Add; }; auto IsSub = [](unsigned Op) { return Op == Instruction::FSub || Op == Instruction::Sub; }; ComplexDeinterleavingRotation Rotation; if (IsAdd(Real->getOpcode()) && IsAdd(Imag->getOpcode())) Rotation = ComplexDeinterleavingRotation::Rotation_0; else if (IsSub(Real->getOpcode()) && IsAdd(Imag->getOpcode())) Rotation = ComplexDeinterleavingRotation::Rotation_90; else if (IsSub(Real->getOpcode()) && IsSub(Imag->getOpcode())) Rotation = ComplexDeinterleavingRotation::Rotation_180; else if (IsAdd(Real->getOpcode()) && IsSub(Imag->getOpcode())) Rotation = ComplexDeinterleavingRotation::Rotation_270; else { LLVM_DEBUG(dbgs() << " - Unhandled rotation.\n"); return nullptr; } if (isa(Real) && (!Real->getFastMathFlags().allowContract() || !Imag->getFastMathFlags().allowContract())) { LLVM_DEBUG(dbgs() << " - Contract is missing from the FastMath flags.\n"); return nullptr; } Value *CR = Real->getOperand(0); Instruction *RealMulI = dyn_cast(Real->getOperand(1)); if (!RealMulI) return nullptr; Value *CI = Imag->getOperand(0); Instruction 
*ImagMulI = dyn_cast(Imag->getOperand(1)); if (!ImagMulI) return nullptr; if (!RealMulI->hasOneUse() || !ImagMulI->hasOneUse()) { LLVM_DEBUG(dbgs() << " - Mul instruction has multiple uses\n"); return nullptr; } Value *R0 = RealMulI->getOperand(0); Value *R1 = RealMulI->getOperand(1); Value *I0 = ImagMulI->getOperand(0); Value *I1 = ImagMulI->getOperand(1); Value *CommonOperand; Value *UncommonRealOp; Value *UncommonImagOp; if (R0 == I0 || R0 == I1) { CommonOperand = R0; UncommonRealOp = R1; } else if (R1 == I0 || R1 == I1) { CommonOperand = R1; UncommonRealOp = R0; } else { LLVM_DEBUG(dbgs() << " - No equal operand\n"); return nullptr; } UncommonImagOp = (CommonOperand == I0) ? I1 : I0; if (Rotation == ComplexDeinterleavingRotation::Rotation_90 || Rotation == ComplexDeinterleavingRotation::Rotation_270) std::swap(UncommonRealOp, UncommonImagOp); std::pair PartialMatch( (Rotation == ComplexDeinterleavingRotation::Rotation_0 || Rotation == ComplexDeinterleavingRotation::Rotation_180) ? CommonOperand : nullptr, (Rotation == ComplexDeinterleavingRotation::Rotation_90 || Rotation == ComplexDeinterleavingRotation::Rotation_270) ? CommonOperand : nullptr); auto *CRInst = dyn_cast(CR); auto *CIInst = dyn_cast(CI); if (!CRInst || !CIInst) { LLVM_DEBUG(dbgs() << " - Common operands are not instructions.\n"); return nullptr; } NodePtr CNode = identifyNodeWithImplicitAdd(CRInst, CIInst, PartialMatch); if (!CNode) { LLVM_DEBUG(dbgs() << " - No cnode identified\n"); return nullptr; } NodePtr UncommonRes = identifyNode(UncommonRealOp, UncommonImagOp); if (!UncommonRes) { LLVM_DEBUG(dbgs() << " - No UncommonRes identified\n"); return nullptr; } assert(PartialMatch.first && PartialMatch.second); NodePtr CommonRes = identifyNode(PartialMatch.first, PartialMatch.second); if (!CommonRes) { LLVM_DEBUG(dbgs() << " - No CommonRes identified\n"); return nullptr; } NodePtr Node = prepareCompositeNode( ComplexDeinterleavingOperation::CMulPartial, Real, Imag); Node->Rotation = Rotation; Node->addOperand(CommonRes); Node->addOperand(UncommonRes); Node->addOperand(CNode); return submitCompositeNode(Node); } ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyAdd(Instruction *Real, Instruction *Imag) { LLVM_DEBUG(dbgs() << "identifyAdd " << *Real << " / " << *Imag << "\n"); // Determine rotation ComplexDeinterleavingRotation Rotation; if ((Real->getOpcode() == Instruction::FSub && Imag->getOpcode() == Instruction::FAdd) || (Real->getOpcode() == Instruction::Sub && Imag->getOpcode() == Instruction::Add)) Rotation = ComplexDeinterleavingRotation::Rotation_90; else if ((Real->getOpcode() == Instruction::FAdd && Imag->getOpcode() == Instruction::FSub) || (Real->getOpcode() == Instruction::Add && Imag->getOpcode() == Instruction::Sub)) Rotation = ComplexDeinterleavingRotation::Rotation_270; else { LLVM_DEBUG(dbgs() << " - Unhandled case, rotation is not assigned.\n"); return nullptr; } auto *AR = dyn_cast(Real->getOperand(0)); auto *BI = dyn_cast(Real->getOperand(1)); auto *AI = dyn_cast(Imag->getOperand(0)); auto *BR = dyn_cast(Imag->getOperand(1)); if (!AR || !AI || !BR || !BI) { LLVM_DEBUG(dbgs() << " - Not all operands are instructions.\n"); return nullptr; } NodePtr ResA = identifyNode(AR, AI); if (!ResA) { LLVM_DEBUG(dbgs() << " - AR/AI is not identified as a composite node.\n"); return nullptr; } NodePtr ResB = identifyNode(BR, BI); if (!ResB) { LLVM_DEBUG(dbgs() << " - BR/BI is not identified as a composite node.\n"); return nullptr; } NodePtr Node = 
prepareCompositeNode(ComplexDeinterleavingOperation::CAdd, Real, Imag); Node->Rotation = Rotation; Node->addOperand(ResA); Node->addOperand(ResB); return submitCompositeNode(Node); } static bool isInstructionPairAdd(Instruction *A, Instruction *B) { unsigned OpcA = A->getOpcode(); unsigned OpcB = B->getOpcode(); return (OpcA == Instruction::FSub && OpcB == Instruction::FAdd) || (OpcA == Instruction::FAdd && OpcB == Instruction::FSub) || (OpcA == Instruction::Sub && OpcB == Instruction::Add) || (OpcA == Instruction::Add && OpcB == Instruction::Sub); } static bool isInstructionPairMul(Instruction *A, Instruction *B) { auto Pattern = m_BinOp(m_FMul(m_Value(), m_Value()), m_FMul(m_Value(), m_Value())); return match(A, Pattern) && match(B, Pattern); } static bool isInstructionPotentiallySymmetric(Instruction *I) { switch (I->getOpcode()) { case Instruction::FAdd: case Instruction::FSub: case Instruction::FMul: case Instruction::FNeg: case Instruction::Add: case Instruction::Sub: case Instruction::Mul: return true; default: return false; } } ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real, Instruction *Imag) { if (Real->getOpcode() != Imag->getOpcode()) return nullptr; if (!isInstructionPotentiallySymmetric(Real) || !isInstructionPotentiallySymmetric(Imag)) return nullptr; auto *R0 = Real->getOperand(0); auto *I0 = Imag->getOperand(0); NodePtr Op0 = identifyNode(R0, I0); NodePtr Op1 = nullptr; if (Op0 == nullptr) return nullptr; if (Real->isBinaryOp()) { auto *R1 = Real->getOperand(1); auto *I1 = Imag->getOperand(1); Op1 = identifyNode(R1, I1); if (Op1 == nullptr) return nullptr; } if (isa(Real) && Real->getFastMathFlags() != Imag->getFastMathFlags()) return nullptr; auto Node = prepareCompositeNode(ComplexDeinterleavingOperation::Symmetric, Real, Imag); Node->Opcode = Real->getOpcode(); if (isa(Real)) Node->Flags = Real->getFastMathFlags(); Node->addOperand(Op0); if (Real->isBinaryOp()) Node->addOperand(Op1); return submitCompositeNode(Node); } ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) { LLVM_DEBUG(dbgs() << "identifyNode on " << *R << " / " << *I << "\n"); assert(R->getType() == I->getType() && "Real and imaginary parts should not have different types"); auto It = CachedResult.find({R, I}); if (It != CachedResult.end()) { LLVM_DEBUG(dbgs() << " - Folding to existing node\n"); return It->second; } if (NodePtr CN = identifySplat(R, I)) return CN; auto *Real = dyn_cast(R); auto *Imag = dyn_cast(I); if (!Real || !Imag) return nullptr; if (NodePtr CN = identifyDeinterleave(Real, Imag)) return CN; if (NodePtr CN = identifyPHINode(Real, Imag)) return CN; if (NodePtr CN = identifySelectNode(Real, Imag)) return CN; auto *VTy = cast(Real->getType()); auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); bool HasCMulSupport = TL->isComplexDeinterleavingOperationSupported( ComplexDeinterleavingOperation::CMulPartial, NewVTy); bool HasCAddSupport = TL->isComplexDeinterleavingOperationSupported( ComplexDeinterleavingOperation::CAdd, NewVTy); if (HasCMulSupport && isInstructionPairMul(Real, Imag)) { if (NodePtr CN = identifyPartialMul(Real, Imag)) return CN; } if (HasCAddSupport && isInstructionPairAdd(Real, Imag)) { if (NodePtr CN = identifyAdd(Real, Imag)) return CN; } if (HasCMulSupport && HasCAddSupport) { if (NodePtr CN = identifyReassocNodes(Real, Imag)) return CN; } if (NodePtr CN = identifySymmetricOperation(Real, Imag)) return CN; LLVM_DEBUG(dbgs() << " - Not recognised as a 
valid pattern.\n"); CachedResult[{R, I}] = nullptr; return nullptr; } ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real, Instruction *Imag) { auto IsOperationSupported = [](unsigned Opcode) -> bool { return Opcode == Instruction::FAdd || Opcode == Instruction::FSub || Opcode == Instruction::FNeg || Opcode == Instruction::Add || Opcode == Instruction::Sub; }; if (!IsOperationSupported(Real->getOpcode()) || !IsOperationSupported(Imag->getOpcode())) return nullptr; std::optional Flags; if (isa(Real)) { if (Real->getFastMathFlags() != Imag->getFastMathFlags()) { LLVM_DEBUG(dbgs() << "The flags in Real and Imaginary instructions are " "not identical\n"); return nullptr; } Flags = Real->getFastMathFlags(); if (!Flags->allowReassoc()) { LLVM_DEBUG( dbgs() << "the 'Reassoc' attribute is missing in the FastMath flags\n"); return nullptr; } } // Collect multiplications and addend instructions from the given instruction // while traversing it operands. Additionally, verify that all instructions // have the same fast math flags. auto Collect = [&Flags](Instruction *Insn, std::vector &Muls, std::list &Addends) -> bool { SmallVector> Worklist = {{Insn, true}}; SmallPtrSet Visited; while (!Worklist.empty()) { auto [V, IsPositive] = Worklist.back(); Worklist.pop_back(); if (!Visited.insert(V).second) continue; Instruction *I = dyn_cast(V); if (!I) { Addends.emplace_back(V, IsPositive); continue; } // If an instruction has more than one user, it indicates that it either // has an external user, which will be later checked by the checkNodes // function, or it is a subexpression utilized by multiple expressions. In // the latter case, we will attempt to separately identify the complex // operation from here in order to create a shared // ComplexDeinterleavingCompositeNode. 
if (I != Insn && I->getNumUses() > 1) { LLVM_DEBUG(dbgs() << "Found potential sub-expression: " << *I << "\n"); Addends.emplace_back(I, IsPositive); continue; } switch (I->getOpcode()) { case Instruction::FAdd: case Instruction::Add: Worklist.emplace_back(I->getOperand(1), IsPositive); Worklist.emplace_back(I->getOperand(0), IsPositive); break; case Instruction::FSub: Worklist.emplace_back(I->getOperand(1), !IsPositive); Worklist.emplace_back(I->getOperand(0), IsPositive); break; case Instruction::Sub: if (isNeg(I)) { Worklist.emplace_back(getNegOperand(I), !IsPositive); } else { Worklist.emplace_back(I->getOperand(1), !IsPositive); Worklist.emplace_back(I->getOperand(0), IsPositive); } break; case Instruction::FMul: case Instruction::Mul: { Value *A, *B; if (isNeg(I->getOperand(0))) { A = getNegOperand(I->getOperand(0)); IsPositive = !IsPositive; } else { A = I->getOperand(0); } if (isNeg(I->getOperand(1))) { B = getNegOperand(I->getOperand(1)); IsPositive = !IsPositive; } else { B = I->getOperand(1); } Muls.push_back(Product{A, B, IsPositive}); break; } case Instruction::FNeg: Worklist.emplace_back(I->getOperand(0), !IsPositive); break; default: Addends.emplace_back(I, IsPositive); continue; } if (Flags && I->getFastMathFlags() != *Flags) { LLVM_DEBUG(dbgs() << "The instruction's fast math flags are " "inconsistent with the root instructions' flags: " << *I << "\n"); return false; } } return true; }; std::vector RealMuls, ImagMuls; std::list RealAddends, ImagAddends; if (!Collect(Real, RealMuls, RealAddends) || !Collect(Imag, ImagMuls, ImagAddends)) return nullptr; if (RealAddends.size() != ImagAddends.size()) return nullptr; NodePtr FinalNode; if (!RealMuls.empty() || !ImagMuls.empty()) { // If there are multiplicands, extract positive addend and use it as an // accumulator FinalNode = extractPositiveAddend(RealAddends, ImagAddends); FinalNode = identifyMultiplications(RealMuls, ImagMuls, FinalNode); if (!FinalNode) return nullptr; } // Identify and process remaining additions if (!RealAddends.empty() || !ImagAddends.empty()) { FinalNode = identifyAdditions(RealAddends, ImagAddends, Flags, FinalNode); if (!FinalNode) return nullptr; } assert(FinalNode && "FinalNode can not be nullptr here"); // Set the Real and Imag fields of the final node and submit it FinalNode->Real = Real; FinalNode->Imag = Imag; submitCompositeNode(FinalNode); return FinalNode; } bool ComplexDeinterleavingGraph::collectPartialMuls( const std::vector &RealMuls, const std::vector &ImagMuls, std::vector &PartialMulCandidates) { // Helper function to extract a common operand from two products auto FindCommonInstruction = [](const Product &Real, const Product &Imag) -> Value * { if (Real.Multiplicand == Imag.Multiplicand || Real.Multiplicand == Imag.Multiplier) return Real.Multiplicand; if (Real.Multiplier == Imag.Multiplicand || Real.Multiplier == Imag.Multiplier) return Real.Multiplier; return nullptr; }; // Iterating over real and imaginary multiplications to find common operands // If a common operand is found, a partial multiplication candidate is created // and added to the candidates vector The function returns false if no common // operands are found for any product for (unsigned i = 0; i < RealMuls.size(); ++i) { bool FoundCommon = false; for (unsigned j = 0; j < ImagMuls.size(); ++j) { auto *Common = FindCommonInstruction(RealMuls[i], ImagMuls[j]); if (!Common) continue; auto *A = RealMuls[i].Multiplicand == Common ? 
RealMuls[i].Multiplier : RealMuls[i].Multiplicand; auto *B = ImagMuls[j].Multiplicand == Common ? ImagMuls[j].Multiplier : ImagMuls[j].Multiplicand; auto Node = identifyNode(A, B); if (Node) { FoundCommon = true; PartialMulCandidates.push_back({Common, Node, i, j, false}); } Node = identifyNode(B, A); if (Node) { FoundCommon = true; PartialMulCandidates.push_back({Common, Node, i, j, true}); } } if (!FoundCommon) return false; } return true; } ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyMultiplications( std::vector &RealMuls, std::vector &ImagMuls, NodePtr Accumulator = nullptr) { if (RealMuls.size() != ImagMuls.size()) return nullptr; std::vector Info; if (!collectPartialMuls(RealMuls, ImagMuls, Info)) return nullptr; // Map to store common instruction to node pointers std::map CommonToNode; std::vector Processed(Info.size(), false); for (unsigned I = 0; I < Info.size(); ++I) { if (Processed[I]) continue; PartialMulCandidate &InfoA = Info[I]; for (unsigned J = I + 1; J < Info.size(); ++J) { if (Processed[J]) continue; PartialMulCandidate &InfoB = Info[J]; auto *InfoReal = &InfoA; auto *InfoImag = &InfoB; auto NodeFromCommon = identifyNode(InfoReal->Common, InfoImag->Common); if (!NodeFromCommon) { std::swap(InfoReal, InfoImag); NodeFromCommon = identifyNode(InfoReal->Common, InfoImag->Common); } if (!NodeFromCommon) continue; CommonToNode[InfoReal->Common] = NodeFromCommon; CommonToNode[InfoImag->Common] = NodeFromCommon; Processed[I] = true; Processed[J] = true; } } std::vector ProcessedReal(RealMuls.size(), false); std::vector ProcessedImag(ImagMuls.size(), false); NodePtr Result = Accumulator; for (auto &PMI : Info) { if (ProcessedReal[PMI.RealIdx] || ProcessedImag[PMI.ImagIdx]) continue; auto It = CommonToNode.find(PMI.Common); // TODO: Process independent complex multiplications. Cases like this: // A.real() * B where both A and B are complex numbers. if (It == CommonToNode.end()) { LLVM_DEBUG({ dbgs() << "Unprocessed independent partial multiplication:\n"; for (auto *Mul : {&RealMuls[PMI.RealIdx], &RealMuls[PMI.RealIdx]}) dbgs().indent(4) << (Mul->IsPositive ? "+" : "-") << *Mul->Multiplier << " multiplied by " << *Mul->Multiplicand << "\n"; }); return nullptr; } auto &RealMul = RealMuls[PMI.RealIdx]; auto &ImagMul = ImagMuls[PMI.ImagIdx]; auto NodeA = It->second; auto NodeB = PMI.Node; auto IsMultiplicandReal = PMI.Common == NodeA->Real; // The following table illustrates the relationship between multiplications // and rotations. 
If we consider the multiplication (X + iY) * (U + iV), we // can see: // // Rotation | Real | Imag | // ---------+--------+--------+ // 0 | x * u | x * v | // 90 | -y * v | y * u | // 180 | -x * u | -x * v | // 270 | y * v | -y * u | // // Check if the candidate can indeed be represented by partial // multiplication // TODO: Add support for multiplication by complex one if ((IsMultiplicandReal && PMI.IsNodeInverted) || (!IsMultiplicandReal && !PMI.IsNodeInverted)) continue; // Determine the rotation based on the multiplications ComplexDeinterleavingRotation Rotation; if (IsMultiplicandReal) { // Detect 0 and 180 degrees rotation if (RealMul.IsPositive && ImagMul.IsPositive) Rotation = llvm::ComplexDeinterleavingRotation::Rotation_0; else if (!RealMul.IsPositive && !ImagMul.IsPositive) Rotation = llvm::ComplexDeinterleavingRotation::Rotation_180; else continue; } else { // Detect 90 and 270 degrees rotation if (!RealMul.IsPositive && ImagMul.IsPositive) Rotation = llvm::ComplexDeinterleavingRotation::Rotation_90; else if (RealMul.IsPositive && !ImagMul.IsPositive) Rotation = llvm::ComplexDeinterleavingRotation::Rotation_270; else continue; } LLVM_DEBUG({ dbgs() << "Identified partial multiplication (X, Y) * (U, V):\n"; dbgs().indent(4) << "X: " << *NodeA->Real << "\n"; dbgs().indent(4) << "Y: " << *NodeA->Imag << "\n"; dbgs().indent(4) << "U: " << *NodeB->Real << "\n"; dbgs().indent(4) << "V: " << *NodeB->Imag << "\n"; dbgs().indent(4) << "Rotation - " << (int)Rotation * 90 << "\n"; }); NodePtr NodeMul = prepareCompositeNode( ComplexDeinterleavingOperation::CMulPartial, nullptr, nullptr); NodeMul->Rotation = Rotation; NodeMul->addOperand(NodeA); NodeMul->addOperand(NodeB); if (Result) NodeMul->addOperand(Result); submitCompositeNode(NodeMul); Result = NodeMul; ProcessedReal[PMI.RealIdx] = true; ProcessedImag[PMI.ImagIdx] = true; } // Ensure all products have been processed, if not return nullptr. if (!all_of(ProcessedReal, [](bool V) { return V; }) || !all_of(ProcessedImag, [](bool V) { return V; })) { // Dump debug information about which partial multiplications are not // processed. LLVM_DEBUG({ dbgs() << "Unprocessed products (Real):\n"; for (size_t i = 0; i < ProcessedReal.size(); ++i) { if (!ProcessedReal[i]) dbgs().indent(4) << (RealMuls[i].IsPositive ? "+" : "-") << *RealMuls[i].Multiplier << " multiplied by " << *RealMuls[i].Multiplicand << "\n"; } dbgs() << "Unprocessed products (Imag):\n"; for (size_t i = 0; i < ProcessedImag.size(); ++i) { if (!ProcessedImag[i]) dbgs().indent(4) << (ImagMuls[i].IsPositive ? "+" : "-") << *ImagMuls[i].Multiplier << " multiplied by " << *ImagMuls[i].Multiplicand << "\n"; } }); return nullptr; } return Result; } ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyAdditions( std::list &RealAddends, std::list &ImagAddends, std::optional Flags, NodePtr Accumulator = nullptr) { if (RealAddends.size() != ImagAddends.size()) return nullptr; NodePtr Result; // If we have accumulator use it as first addend if (Accumulator) Result = Accumulator; // Otherwise find an element with both positive real and imaginary parts. 
else Result = extractPositiveAddend(RealAddends, ImagAddends); if (!Result) return nullptr; while (!RealAddends.empty()) { auto ItR = RealAddends.begin(); auto [R, IsPositiveR] = *ItR; bool FoundImag = false; for (auto ItI = ImagAddends.begin(); ItI != ImagAddends.end(); ++ItI) { auto [I, IsPositiveI] = *ItI; ComplexDeinterleavingRotation Rotation; if (IsPositiveR && IsPositiveI) Rotation = ComplexDeinterleavingRotation::Rotation_0; else if (!IsPositiveR && IsPositiveI) Rotation = ComplexDeinterleavingRotation::Rotation_90; else if (!IsPositiveR && !IsPositiveI) Rotation = ComplexDeinterleavingRotation::Rotation_180; else Rotation = ComplexDeinterleavingRotation::Rotation_270; NodePtr AddNode; if (Rotation == ComplexDeinterleavingRotation::Rotation_0 || Rotation == ComplexDeinterleavingRotation::Rotation_180) { AddNode = identifyNode(R, I); } else { AddNode = identifyNode(I, R); } if (AddNode) { LLVM_DEBUG({ dbgs() << "Identified addition:\n"; dbgs().indent(4) << "X: " << *R << "\n"; dbgs().indent(4) << "Y: " << *I << "\n"; dbgs().indent(4) << "Rotation - " << (int)Rotation * 90 << "\n"; }); NodePtr TmpNode; if (Rotation == llvm::ComplexDeinterleavingRotation::Rotation_0) { TmpNode = prepareCompositeNode( ComplexDeinterleavingOperation::Symmetric, nullptr, nullptr); if (Flags) { TmpNode->Opcode = Instruction::FAdd; TmpNode->Flags = *Flags; } else { TmpNode->Opcode = Instruction::Add; } } else if (Rotation == llvm::ComplexDeinterleavingRotation::Rotation_180) { TmpNode = prepareCompositeNode( ComplexDeinterleavingOperation::Symmetric, nullptr, nullptr); if (Flags) { TmpNode->Opcode = Instruction::FSub; TmpNode->Flags = *Flags; } else { TmpNode->Opcode = Instruction::Sub; } } else { TmpNode = prepareCompositeNode(ComplexDeinterleavingOperation::CAdd, nullptr, nullptr); TmpNode->Rotation = Rotation; } TmpNode->addOperand(Result); TmpNode->addOperand(AddNode); submitCompositeNode(TmpNode); Result = TmpNode; RealAddends.erase(ItR); ImagAddends.erase(ItI); FoundImag = true; break; } } if (!FoundImag) return nullptr; } return Result; } ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::extractPositiveAddend( std::list &RealAddends, std::list &ImagAddends) { for (auto ItR = RealAddends.begin(); ItR != RealAddends.end(); ++ItR) { for (auto ItI = ImagAddends.begin(); ItI != ImagAddends.end(); ++ItI) { auto [R, IsPositiveR] = *ItR; auto [I, IsPositiveI] = *ItI; if (IsPositiveR && IsPositiveI) { auto Result = identifyNode(R, I); if (Result) { RealAddends.erase(ItR); ImagAddends.erase(ItI); return Result; } } } } return nullptr; } bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { // This potential root instruction might already have been recognized as // reduction. Because RootToNode maps both Real and Imaginary parts to // CompositeNode we should choose only one either Real or Imag instruction to // use as an anchor for generating complex instruction. auto It = RootToNode.find(RootI); - if (It != RootToNode.end() && It->second->Real == RootI) { + if (It != RootToNode.end()) { + auto RootNode = It->second; + assert(RootNode->Operation == + ComplexDeinterleavingOperation::ReductionOperation); + // Find out which part, Real or Imag, comes later, and only if we come to + // the latest part, add it to OrderedRoots. + auto *R = cast(RootNode->Real); + auto *I = cast(RootNode->Imag); + auto *ReplacementAnchor = R->comesBefore(I) ? 
I : R; + if (ReplacementAnchor != RootI) + return false; OrderedRoots.push_back(RootI); return true; } auto RootNode = identifyRoot(RootI); if (!RootNode) return false; LLVM_DEBUG({ Function *F = RootI->getFunction(); BasicBlock *B = RootI->getParent(); dbgs() << "Complex deinterleaving graph for " << F->getName() << "::" << B->getName() << ".\n"; dump(dbgs()); dbgs() << "\n"; }); RootToNode[RootI] = RootNode; OrderedRoots.push_back(RootI); return true; } bool ComplexDeinterleavingGraph::collectPotentialReductions(BasicBlock *B) { bool FoundPotentialReduction = false; auto *Br = dyn_cast(B->getTerminator()); if (!Br || Br->getNumSuccessors() != 2) return false; // Identify simple one-block loop if (Br->getSuccessor(0) != B && Br->getSuccessor(1) != B) return false; SmallVector PHIs; for (auto &PHI : B->phis()) { if (PHI.getNumIncomingValues() != 2) continue; if (!PHI.getType()->isVectorTy()) continue; auto *ReductionOp = dyn_cast(PHI.getIncomingValueForBlock(B)); if (!ReductionOp) continue; // Check if final instruction is reduced outside of current block Instruction *FinalReduction = nullptr; auto NumUsers = 0u; for (auto *U : ReductionOp->users()) { ++NumUsers; if (U == &PHI) continue; FinalReduction = dyn_cast(U); } if (NumUsers != 2 || !FinalReduction || FinalReduction->getParent() == B || isa(FinalReduction)) continue; ReductionInfo[ReductionOp] = {&PHI, FinalReduction}; BackEdge = B; auto BackEdgeIdx = PHI.getBasicBlockIndex(B); auto IncomingIdx = BackEdgeIdx == 0 ? 1 : 0; Incoming = PHI.getIncomingBlock(IncomingIdx); FoundPotentialReduction = true; // If the initial value of PHINode is an Instruction, consider it a leaf // value of a complex deinterleaving graph. if (auto *InitPHI = dyn_cast(PHI.getIncomingValueForBlock(Incoming))) FinalInstructions.insert(InitPHI); } return FoundPotentialReduction; } void ComplexDeinterleavingGraph::identifyReductionNodes() { SmallVector Processed(ReductionInfo.size(), false); SmallVector OperationInstruction; for (auto &P : ReductionInfo) OperationInstruction.push_back(P.first); // Identify a complex computation by evaluating two reduction operations that // potentially could be involved for (size_t i = 0; i < OperationInstruction.size(); ++i) { if (Processed[i]) continue; for (size_t j = i + 1; j < OperationInstruction.size(); ++j) { if (Processed[j]) continue; auto *Real = OperationInstruction[i]; auto *Imag = OperationInstruction[j]; if (Real->getType() != Imag->getType()) continue; RealPHI = ReductionInfo[Real].first; ImagPHI = ReductionInfo[Imag].first; PHIsFound = false; auto Node = identifyNode(Real, Imag); if (!Node) { std::swap(Real, Imag); std::swap(RealPHI, ImagPHI); Node = identifyNode(Real, Imag); } // If a node is identified and reduction PHINode is used in the chain of // operations, mark its operation instructions as used to prevent // re-identification and attach the node to the real part if (Node && PHIsFound) { LLVM_DEBUG(dbgs() << "Identified reduction starting from instructions: " << *Real << " / " << *Imag << "\n"); Processed[i] = true; Processed[j] = true; auto RootNode = prepareCompositeNode( ComplexDeinterleavingOperation::ReductionOperation, Real, Imag); RootNode->addOperand(Node); RootToNode[Real] = RootNode; RootToNode[Imag] = RootNode; submitCompositeNode(RootNode); break; } } } RealPHI = nullptr; ImagPHI = nullptr; } bool ComplexDeinterleavingGraph::checkNodes() { // Collect all instructions from roots to leaves SmallPtrSet AllInstructions; SmallVector Worklist; for (auto &Pair : RootToNode) 
Worklist.push_back(Pair.first); // Extract all instructions that are used by all XCMLA/XCADD/ADD/SUB/NEG // chains while (!Worklist.empty()) { auto *I = Worklist.back(); Worklist.pop_back(); if (!AllInstructions.insert(I).second) continue; for (Value *Op : I->operands()) { if (auto *OpI = dyn_cast(Op)) { if (!FinalInstructions.count(I)) Worklist.emplace_back(OpI); } } } // Find instructions that have users outside of chain SmallVector OuterInstructions; for (auto *I : AllInstructions) { // Skip root nodes if (RootToNode.count(I)) continue; for (User *U : I->users()) { if (AllInstructions.count(cast(U))) continue; // Found an instruction that is not used by XCMLA/XCADD chain Worklist.emplace_back(I); break; } } // If any instructions are found to be used outside, find and remove roots // that somehow connect to those instructions. SmallPtrSet Visited; while (!Worklist.empty()) { auto *I = Worklist.back(); Worklist.pop_back(); if (!Visited.insert(I).second) continue; // Found an impacted root node. Removing it from the nodes to be // deinterleaved if (RootToNode.count(I)) { LLVM_DEBUG(dbgs() << "Instruction " << *I << " could be deinterleaved but its chain of complex " "operations have an outside user\n"); RootToNode.erase(I); } if (!AllInstructions.count(I) || FinalInstructions.count(I)) continue; for (User *U : I->users()) Worklist.emplace_back(cast(U)); for (Value *Op : I->operands()) { if (auto *OpI = dyn_cast(Op)) Worklist.emplace_back(OpI); } } return !RootToNode.empty(); } ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyRoot(Instruction *RootI) { if (auto *Intrinsic = dyn_cast(RootI)) { if (Intrinsic->getIntrinsicID() != Intrinsic::experimental_vector_interleave2) return nullptr; auto *Real = dyn_cast(Intrinsic->getOperand(0)); auto *Imag = dyn_cast(Intrinsic->getOperand(1)); if (!Real || !Imag) return nullptr; return identifyNode(Real, Imag); } auto *SVI = dyn_cast(RootI); if (!SVI) return nullptr; // Look for a shufflevector that takes separate vectors of the real and // imaginary components and recombines them into a single vector. 
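  // For example (fixed-width sketch, not from a test in this patch), a root
  // that interleaves %real and %imag back into a complex vector:
  //   %root = shufflevector <4 x float> %real, <4 x float> %imag,
  //           <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>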
if (!isInterleavingMask(SVI->getShuffleMask())) return nullptr; Instruction *Real; Instruction *Imag; if (!match(RootI, m_Shuffle(m_Instruction(Real), m_Instruction(Imag)))) return nullptr; return identifyNode(Real, Imag); } ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real, Instruction *Imag) { Instruction *I = nullptr; Value *FinalValue = nullptr; if (match(Real, m_ExtractValue<0>(m_Instruction(I))) && match(Imag, m_ExtractValue<1>(m_Specific(I))) && match(I, m_Intrinsic( m_Value(FinalValue)))) { NodePtr PlaceholderNode = prepareCompositeNode( llvm::ComplexDeinterleavingOperation::Deinterleave, Real, Imag); PlaceholderNode->ReplacementNode = FinalValue; FinalInstructions.insert(Real); FinalInstructions.insert(Imag); return submitCompositeNode(PlaceholderNode); } auto *RealShuffle = dyn_cast(Real); auto *ImagShuffle = dyn_cast(Imag); if (!RealShuffle || !ImagShuffle) { if (RealShuffle || ImagShuffle) LLVM_DEBUG(dbgs() << " - There's a shuffle where there shouldn't be.\n"); return nullptr; } Value *RealOp1 = RealShuffle->getOperand(1); if (!isa(RealOp1) && !isa(RealOp1)) { LLVM_DEBUG(dbgs() << " - RealOp1 is not undef or zero.\n"); return nullptr; } Value *ImagOp1 = ImagShuffle->getOperand(1); if (!isa(ImagOp1) && !isa(ImagOp1)) { LLVM_DEBUG(dbgs() << " - ImagOp1 is not undef or zero.\n"); return nullptr; } Value *RealOp0 = RealShuffle->getOperand(0); Value *ImagOp0 = ImagShuffle->getOperand(0); if (RealOp0 != ImagOp0) { LLVM_DEBUG(dbgs() << " - Shuffle operands are not equal.\n"); return nullptr; } ArrayRef RealMask = RealShuffle->getShuffleMask(); ArrayRef ImagMask = ImagShuffle->getShuffleMask(); if (!isDeinterleavingMask(RealMask) || !isDeinterleavingMask(ImagMask)) { LLVM_DEBUG(dbgs() << " - Masks are not deinterleaving.\n"); return nullptr; } if (RealMask[0] != 0 || ImagMask[0] != 1) { LLVM_DEBUG(dbgs() << " - Masks do not have the correct initial value.\n"); return nullptr; } // Type checking, the shuffle type should be a vector type of the same // scalar type, but half the size auto CheckType = [&](ShuffleVectorInst *Shuffle) { Value *Op = Shuffle->getOperand(0); auto *ShuffleTy = cast(Shuffle->getType()); auto *OpTy = cast(Op->getType()); if (OpTy->getScalarType() != ShuffleTy->getScalarType()) return false; if ((ShuffleTy->getNumElements() * 2) != OpTy->getNumElements()) return false; return true; }; auto CheckDeinterleavingShuffle = [&](ShuffleVectorInst *Shuffle) -> bool { if (!CheckType(Shuffle)) return false; ArrayRef Mask = Shuffle->getShuffleMask(); int Last = *Mask.rbegin(); Value *Op = Shuffle->getOperand(0); auto *OpTy = cast(Op->getType()); int NumElements = OpTy->getNumElements(); // Ensure that the deinterleaving shuffle only pulls from the first // shuffle operand. 
return Last < NumElements; }; if (RealShuffle->getType() != ImagShuffle->getType()) { LLVM_DEBUG(dbgs() << " - Shuffle types aren't equal.\n"); return nullptr; } if (!CheckDeinterleavingShuffle(RealShuffle)) { LLVM_DEBUG(dbgs() << " - RealShuffle is invalid type.\n"); return nullptr; } if (!CheckDeinterleavingShuffle(ImagShuffle)) { LLVM_DEBUG(dbgs() << " - ImagShuffle is invalid type.\n"); return nullptr; } NodePtr PlaceholderNode = prepareCompositeNode(llvm::ComplexDeinterleavingOperation::Deinterleave, RealShuffle, ImagShuffle); PlaceholderNode->ReplacementNode = RealShuffle->getOperand(0); FinalInstructions.insert(RealShuffle); FinalInstructions.insert(ImagShuffle); return submitCompositeNode(PlaceholderNode); } ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) { auto IsSplat = [](Value *V) -> bool { // Fixed-width vector with constants if (isa(V)) return true; VectorType *VTy; ArrayRef Mask; // Splats are represented differently depending on whether the repeated // value is a constant or an Instruction if (auto *Const = dyn_cast(V)) { if (Const->getOpcode() != Instruction::ShuffleVector) return false; VTy = cast(Const->getType()); Mask = Const->getShuffleMask(); } else if (auto *Shuf = dyn_cast(V)) { VTy = Shuf->getType(); Mask = Shuf->getShuffleMask(); } else { return false; } // When the data type is <1 x Type>, it's not possible to differentiate // between the ComplexDeinterleaving::Deinterleave and // ComplexDeinterleaving::Splat operations. if (!VTy->isScalableTy() && VTy->getElementCount().getKnownMinValue() == 1) return false; return all_equal(Mask) && Mask[0] == 0; }; if (!IsSplat(R) || !IsSplat(I)) return nullptr; auto *Real = dyn_cast(R); auto *Imag = dyn_cast(I); if ((!Real && Imag) || (Real && !Imag)) return nullptr; if (Real && Imag) { // Non-constant splats should be in the same basic block if (Real->getParent() != Imag->getParent()) return nullptr; FinalInstructions.insert(Real); FinalInstructions.insert(Imag); } NodePtr PlaceholderNode = prepareCompositeNode(ComplexDeinterleavingOperation::Splat, R, I); return submitCompositeNode(PlaceholderNode); } ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyPHINode(Instruction *Real, Instruction *Imag) { if (Real != RealPHI || Imag != ImagPHI) return nullptr; PHIsFound = true; NodePtr PlaceholderNode = prepareCompositeNode( ComplexDeinterleavingOperation::ReductionPHI, Real, Imag); return submitCompositeNode(PlaceholderNode); } ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifySelectNode(Instruction *Real, Instruction *Imag) { auto *SelectReal = dyn_cast(Real); auto *SelectImag = dyn_cast(Imag); if (!SelectReal || !SelectImag) return nullptr; Instruction *MaskA, *MaskB; Instruction *AR, *AI, *RA, *BI; if (!match(Real, m_Select(m_Instruction(MaskA), m_Instruction(AR), m_Instruction(RA))) || !match(Imag, m_Select(m_Instruction(MaskB), m_Instruction(AI), m_Instruction(BI)))) return nullptr; if (MaskA != MaskB && !MaskA->isIdenticalTo(MaskB)) return nullptr; if (!MaskA->getType()->isVectorTy()) return nullptr; auto NodeA = identifyNode(AR, AI); if (!NodeA) return nullptr; auto NodeB = identifyNode(RA, BI); if (!NodeB) return nullptr; NodePtr PlaceholderNode = prepareCompositeNode( ComplexDeinterleavingOperation::ReductionSelect, Real, Imag); PlaceholderNode->addOperand(NodeA); PlaceholderNode->addOperand(NodeB); FinalInstructions.insert(MaskA); FinalInstructions.insert(MaskB); return submitCompositeNode(PlaceholderNode); } static Value 
*replaceSymmetricNode(IRBuilderBase &B, unsigned Opcode, std::optional Flags, Value *InputA, Value *InputB) { Value *I; switch (Opcode) { case Instruction::FNeg: I = B.CreateFNeg(InputA); break; case Instruction::FAdd: I = B.CreateFAdd(InputA, InputB); break; case Instruction::Add: I = B.CreateAdd(InputA, InputB); break; case Instruction::FSub: I = B.CreateFSub(InputA, InputB); break; case Instruction::Sub: I = B.CreateSub(InputA, InputB); break; case Instruction::FMul: I = B.CreateFMul(InputA, InputB); break; case Instruction::Mul: I = B.CreateMul(InputA, InputB); break; default: llvm_unreachable("Incorrect symmetric opcode"); } if (Flags) cast(I)->setFastMathFlags(*Flags); return I; } Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, RawNodePtr Node) { if (Node->ReplacementNode) return Node->ReplacementNode; auto ReplaceOperandIfExist = [&](RawNodePtr &Node, unsigned Idx) -> Value * { return Node->Operands.size() > Idx ? replaceNode(Builder, Node->Operands[Idx]) : nullptr; }; Value *ReplacementNode; switch (Node->Operation) { case ComplexDeinterleavingOperation::CAdd: case ComplexDeinterleavingOperation::CMulPartial: case ComplexDeinterleavingOperation::Symmetric: { Value *Input0 = ReplaceOperandIfExist(Node, 0); Value *Input1 = ReplaceOperandIfExist(Node, 1); Value *Accumulator = ReplaceOperandIfExist(Node, 2); assert(!Input1 || (Input0->getType() == Input1->getType() && "Node inputs need to be of the same type")); assert(!Accumulator || (Input0->getType() == Accumulator->getType() && "Accumulator and input need to be of the same type")); if (Node->Operation == ComplexDeinterleavingOperation::Symmetric) ReplacementNode = replaceSymmetricNode(Builder, Node->Opcode, Node->Flags, Input0, Input1); else ReplacementNode = TL->createComplexDeinterleavingIR( Builder, Node->Operation, Node->Rotation, Input0, Input1, Accumulator); break; } case ComplexDeinterleavingOperation::Deinterleave: llvm_unreachable("Deinterleave node should already have ReplacementNode"); break; case ComplexDeinterleavingOperation::Splat: { auto *NewTy = VectorType::getDoubleElementsVectorType( cast(Node->Real->getType())); auto *R = dyn_cast(Node->Real); auto *I = dyn_cast(Node->Imag); if (R && I) { // Splats that are not constant are interleaved where they are located Instruction *InsertPoint = (I->comesBefore(R) ? R : I)->getNextNode(); IRBuilder<> IRB(InsertPoint); ReplacementNode = IRB.CreateIntrinsic(Intrinsic::experimental_vector_interleave2, NewTy, {Node->Real, Node->Imag}); } else { ReplacementNode = Builder.CreateIntrinsic(Intrinsic::experimental_vector_interleave2, NewTy, {Node->Real, Node->Imag}); } break; } case ComplexDeinterleavingOperation::ReductionPHI: { // If Operation is ReductionPHI, a new empty PHINode is created. // It is filled later when the ReductionOperation is processed. 
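    // For example (illustrative types only): a pair of <vscale x 2 x double>
    // reduction PHIs is replaced by a single <vscale x 4 x double> PHI that
    // carries the interleaved complex accumulator.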
auto *VTy = cast(Node->Real->getType()); auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); auto *NewPHI = PHINode::Create(NewVTy, 0, "", BackEdge->getFirstNonPHI()); OldToNewPHI[dyn_cast(Node->Real)] = NewPHI; ReplacementNode = NewPHI; break; } case ComplexDeinterleavingOperation::ReductionOperation: ReplacementNode = replaceNode(Builder, Node->Operands[0]); processReductionOperation(ReplacementNode, Node); break; case ComplexDeinterleavingOperation::ReductionSelect: { auto *MaskReal = cast(Node->Real)->getOperand(0); auto *MaskImag = cast(Node->Imag)->getOperand(0); auto *A = replaceNode(Builder, Node->Operands[0]); auto *B = replaceNode(Builder, Node->Operands[1]); auto *NewMaskTy = VectorType::getDoubleElementsVectorType( cast(MaskReal->getType())); auto *NewMask = Builder.CreateIntrinsic(Intrinsic::experimental_vector_interleave2, NewMaskTy, {MaskReal, MaskImag}); ReplacementNode = Builder.CreateSelect(NewMask, A, B); break; } } assert(ReplacementNode && "Target failed to create Intrinsic call."); NumComplexTransformations += 1; Node->ReplacementNode = ReplacementNode; return ReplacementNode; } void ComplexDeinterleavingGraph::processReductionOperation( Value *OperationReplacement, RawNodePtr Node) { auto *Real = cast(Node->Real); auto *Imag = cast(Node->Imag); auto *OldPHIReal = ReductionInfo[Real].first; auto *OldPHIImag = ReductionInfo[Imag].first; auto *NewPHI = OldToNewPHI[OldPHIReal]; auto *VTy = cast(Real->getType()); auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); // We have to interleave initial origin values coming from IncomingBlock Value *InitReal = OldPHIReal->getIncomingValueForBlock(Incoming); Value *InitImag = OldPHIImag->getIncomingValueForBlock(Incoming); IRBuilder<> Builder(Incoming->getTerminator()); auto *NewInit = Builder.CreateIntrinsic( Intrinsic::experimental_vector_interleave2, NewVTy, {InitReal, InitImag}); NewPHI->addIncoming(NewInit, Incoming); NewPHI->addIncoming(OperationReplacement, BackEdge); // Deinterleave complex vector outside of loop so that it can be finally // reduced auto *FinalReductionReal = ReductionInfo[Real].second; auto *FinalReductionImag = ReductionInfo[Imag].second; Builder.SetInsertPoint( &*FinalReductionReal->getParent()->getFirstInsertionPt()); auto *Deinterleave = Builder.CreateIntrinsic( Intrinsic::experimental_vector_deinterleave2, OperationReplacement->getType(), OperationReplacement); auto *NewReal = Builder.CreateExtractValue(Deinterleave, (uint64_t)0); FinalReductionReal->replaceUsesOfWith(Real, NewReal); Builder.SetInsertPoint(FinalReductionImag); auto *NewImag = Builder.CreateExtractValue(Deinterleave, 1); FinalReductionImag->replaceUsesOfWith(Imag, NewImag); } void ComplexDeinterleavingGraph::replaceNodes() { SmallVector DeadInstrRoots; for (auto *RootInstruction : OrderedRoots) { // Check if this potential root went through check process and we can // deinterleave it if (!RootToNode.count(RootInstruction)) continue; IRBuilder<> Builder(RootInstruction); auto RootNode = RootToNode[RootInstruction]; Value *R = replaceNode(Builder, RootNode.get()); if (RootNode->Operation == ComplexDeinterleavingOperation::ReductionOperation) { auto *RootReal = cast(RootNode->Real); auto *RootImag = cast(RootNode->Imag); ReductionInfo[RootReal].first->removeIncomingValue(BackEdge); ReductionInfo[RootImag].first->removeIncomingValue(BackEdge); DeadInstrRoots.push_back(cast(RootReal)); DeadInstrRoots.push_back(cast(RootImag)); } else { assert(R && "Unable to find replacement for RootInstruction"); 
DeadInstrRoots.push_back(RootInstruction); RootInstruction->replaceAllUsesWith(R); } } for (auto *I : DeadInstrRoots) RecursivelyDeleteTriviallyDeadInstructions(I, TLI); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0605dfa63793..c7a6dd7deb45 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1,26012 +1,26046 @@ //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the AArch64TargetLowering class. // //===----------------------------------------------------------------------===// #include "AArch64ISelLowering.h" #include "AArch64CallingConvention.h" #include "AArch64ExpandImm.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64PerfectShuffle.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ObjCARCUtil.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/Module.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/InstructionCost.h" #include "llvm/Support/KnownBits.h" #include 
"llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/TargetParser/Triple.h" #include #include #include #include #include #include #include #include #include #include #include #include using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "aarch64-lower" STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); // FIXME: The necessary dtprel relocations don't seem to be supported // well in the GNU bfd and gold linkers at the moment. Therefore, by // default, for now, fall back to GeneralDynamic code generation. cl::opt EnableAArch64ELFLocalDynamicTLSGeneration( "aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); static cl::opt EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true)); // Temporary option added for the purpose of testing functionality added // to DAGCombiner.cpp in D92230. It is expected that this can be removed // in future when both implementations will be based off MGATHER rather // than the GLD1 nodes added for the SVE gather load intrinsics. static cl::opt EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true)); // All of the XOR, OR and CMP use ALU ports, and data dependency will become the // bottleneck after this transform on high end CPU. So this max leaf node // limitation is guard cmp+ccmp will be profitable. static cl::opt MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors")); /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7}; static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; const ArrayRef llvm::AArch64::getGPRArgRegs() { return GPRArgRegs; } const ArrayRef llvm::AArch64::getFPRArgRegs() { return FPRArgRegs; } static inline EVT getPackedSVEVectorVT(EVT VT) { switch (VT.getSimpleVT().SimpleTy) { default: llvm_unreachable("unexpected element type for vector"); case MVT::i8: return MVT::nxv16i8; case MVT::i16: return MVT::nxv8i16; case MVT::i32: return MVT::nxv4i32; case MVT::i64: return MVT::nxv2i64; case MVT::f16: return MVT::nxv8f16; case MVT::f32: return MVT::nxv4f32; case MVT::f64: return MVT::nxv2f64; case MVT::bf16: return MVT::nxv8bf16; } } // NOTE: Currently there's only a need to return integer vector types. If this // changes then just add an extra "type" parameter. 
static inline EVT getPackedSVEVectorVT(ElementCount EC) { switch (EC.getKnownMinValue()) { default: llvm_unreachable("unexpected element count for vector"); case 16: return MVT::nxv16i8; case 8: return MVT::nxv8i16; case 4: return MVT::nxv4i32; case 2: return MVT::nxv2i64; } } static inline EVT getPromotedVTForPredicate(EVT VT) { assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) && "Expected scalable predicate vector type!"); switch (VT.getVectorMinNumElements()) { default: llvm_unreachable("unexpected element count for vector"); case 2: return MVT::nxv2i64; case 4: return MVT::nxv4i32; case 8: return MVT::nxv8i16; case 16: return MVT::nxv16i8; } } /// Returns true if VT's elements occupy the lowest bit positions of its /// associated register class without any intervening space. /// /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the /// same register class, but only nxv8f16 can be treated as a packed vector. static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) { assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && "Expected legal vector type!"); return VT.isFixedLengthVector() || VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock; } // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading // predicate and end with a passthru value matching the result type. static bool isMergePassthruOpcode(unsigned Opc) { switch (Opc) { default: return false; case AArch64ISD::BITREVERSE_MERGE_PASSTHRU: case AArch64ISD::BSWAP_MERGE_PASSTHRU: case AArch64ISD::REVH_MERGE_PASSTHRU: case AArch64ISD::REVW_MERGE_PASSTHRU: case AArch64ISD::REVD_MERGE_PASSTHRU: case AArch64ISD::CTLZ_MERGE_PASSTHRU: case AArch64ISD::CTPOP_MERGE_PASSTHRU: case AArch64ISD::DUP_MERGE_PASSTHRU: case AArch64ISD::ABS_MERGE_PASSTHRU: case AArch64ISD::NEG_MERGE_PASSTHRU: case AArch64ISD::FNEG_MERGE_PASSTHRU: case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU: case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU: case AArch64ISD::FCEIL_MERGE_PASSTHRU: case AArch64ISD::FFLOOR_MERGE_PASSTHRU: case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU: case AArch64ISD::FRINT_MERGE_PASSTHRU: case AArch64ISD::FROUND_MERGE_PASSTHRU: case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU: case AArch64ISD::FTRUNC_MERGE_PASSTHRU: case AArch64ISD::FP_ROUND_MERGE_PASSTHRU: case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU: case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU: case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU: case AArch64ISD::FCVTZU_MERGE_PASSTHRU: case AArch64ISD::FCVTZS_MERGE_PASSTHRU: case AArch64ISD::FSQRT_MERGE_PASSTHRU: case AArch64ISD::FRECPX_MERGE_PASSTHRU: case AArch64ISD::FABS_MERGE_PASSTHRU: return true; } } // Returns true if inactive lanes are known to be zeroed by construction. static bool isZeroingInactiveLanes(SDValue Op) { switch (Op.getOpcode()) { default: // We guarantee i1 splat_vectors to zero the other lanes by // implementing it with ptrue and possibly a punpklo for nxv1i1. 
if (ISD::isConstantSplatVectorAllOnes(Op.getNode())) return true; return false; case AArch64ISD::PTRUE: case AArch64ISD::SETCC_MERGE_ZERO: return true; case ISD::INTRINSIC_WO_CHAIN: switch (Op.getConstantOperandVal(0)) { default: return false; case Intrinsic::aarch64_sve_ptrue: case Intrinsic::aarch64_sve_pnext: case Intrinsic::aarch64_sve_cmpeq: case Intrinsic::aarch64_sve_cmpne: case Intrinsic::aarch64_sve_cmpge: case Intrinsic::aarch64_sve_cmpgt: case Intrinsic::aarch64_sve_cmphs: case Intrinsic::aarch64_sve_cmphi: case Intrinsic::aarch64_sve_cmpeq_wide: case Intrinsic::aarch64_sve_cmpne_wide: case Intrinsic::aarch64_sve_cmpge_wide: case Intrinsic::aarch64_sve_cmpgt_wide: case Intrinsic::aarch64_sve_cmplt_wide: case Intrinsic::aarch64_sve_cmple_wide: case Intrinsic::aarch64_sve_cmphs_wide: case Intrinsic::aarch64_sve_cmphi_wide: case Intrinsic::aarch64_sve_cmplo_wide: case Intrinsic::aarch64_sve_cmpls_wide: case Intrinsic::aarch64_sve_fcmpeq: case Intrinsic::aarch64_sve_fcmpne: case Intrinsic::aarch64_sve_fcmpge: case Intrinsic::aarch64_sve_fcmpgt: case Intrinsic::aarch64_sve_fcmpuo: case Intrinsic::aarch64_sve_facgt: case Intrinsic::aarch64_sve_facge: case Intrinsic::aarch64_sve_whilege: case Intrinsic::aarch64_sve_whilegt: case Intrinsic::aarch64_sve_whilehi: case Intrinsic::aarch64_sve_whilehs: case Intrinsic::aarch64_sve_whilele: case Intrinsic::aarch64_sve_whilelo: case Intrinsic::aarch64_sve_whilels: case Intrinsic::aarch64_sve_whilelt: case Intrinsic::aarch64_sve_match: case Intrinsic::aarch64_sve_nmatch: case Intrinsic::aarch64_sve_whilege_x2: case Intrinsic::aarch64_sve_whilegt_x2: case Intrinsic::aarch64_sve_whilehi_x2: case Intrinsic::aarch64_sve_whilehs_x2: case Intrinsic::aarch64_sve_whilele_x2: case Intrinsic::aarch64_sve_whilelo_x2: case Intrinsic::aarch64_sve_whilels_x2: case Intrinsic::aarch64_sve_whilelt_x2: return true; } } } AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so // we have to make something up. Arbitrarily, choose ZeroOrOne. setBooleanContents(ZeroOrOneBooleanContent); // When comparing vectors the result sets the different elements in the // vector to all-one or all-zero. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // Set up the register classes. addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass); addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass); if (Subtarget->hasLS64()) { addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass); setOperationAction(ISD::LOAD, MVT::i64x8, Custom); setOperationAction(ISD::STORE, MVT::i64x8, Custom); } if (Subtarget->hasFPARMv8()) { addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass); addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); } if (Subtarget->hasNEON()) { addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); // Someone set us up the NEON. 
addDRTypeForNEON(MVT::v2f32); addDRTypeForNEON(MVT::v8i8); addDRTypeForNEON(MVT::v4i16); addDRTypeForNEON(MVT::v2i32); addDRTypeForNEON(MVT::v1i64); addDRTypeForNEON(MVT::v1f64); addDRTypeForNEON(MVT::v4f16); if (Subtarget->hasBF16()) addDRTypeForNEON(MVT::v4bf16); addQRTypeForNEON(MVT::v4f32); addQRTypeForNEON(MVT::v2f64); addQRTypeForNEON(MVT::v16i8); addQRTypeForNEON(MVT::v8i16); addQRTypeForNEON(MVT::v4i32); addQRTypeForNEON(MVT::v2i64); addQRTypeForNEON(MVT::v8f16); if (Subtarget->hasBF16()) addQRTypeForNEON(MVT::v8bf16); } if (Subtarget->hasSVEorSME()) { // Add legal sve predicate types addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass); addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass); addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass); addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass); addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass); // Add legal sve data types addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass); if (Subtarget->hasBF16()) { addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass); } if (Subtarget->useSVEForFixedLengthVectors()) { for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) if (useSVEForFixedLengthVectorVT(VT)) addRegisterClass(VT, &AArch64::ZPRRegClass); for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) if (useSVEForFixedLengthVectorVT(VT)) addRegisterClass(VT, &AArch64::ZPRRegClass); } } if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) { addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass); setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1); setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1); setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom); setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand); } // Compute derived properties from the register classes computeRegisterProperties(Subtarget->getRegisterInfo()); // Provide all sorts of operation actions setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); setOperationAction(ISD::SETCC, MVT::i32, Custom); setOperationAction(ISD::SETCC, MVT::i64, Custom); setOperationAction(ISD::SETCC, MVT::f16, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::i64, Custom); setOperationAction(ISD::BR_CC, MVT::f16, Custom); 
setOperationAction(ISD::BR_CC, MVT::f32, Custom); setOperationAction(ISD::BR_CC, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f16, Custom); setOperationAction(ISD::SELECT, MVT::bf16, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); setOperationAction(ISD::SELECT_CC, MVT::bf16, Expand); setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); setOperationAction(ISD::BR_JT, MVT::Other, Custom); setOperationAction(ISD::JumpTable, MVT::i64, Custom); setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom); setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); setOperationAction(ISD::FREM, MVT::f32, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f80, Expand); setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); // Custom lowering hooks are needed for XOR // to fold it into CSINC/CSINV. setOperationAction(ISD::XOR, MVT::i32, Custom); setOperationAction(ISD::XOR, MVT::i64, Custom); // Virtually no operation on f128 is legal, but LLVM can't expand them when // there's a valid register class, so we need custom operations in most cases. setOperationAction(ISD::FABS, MVT::f128, Expand); setOperationAction(ISD::FADD, MVT::f128, LibCall); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); setOperationAction(ISD::FCOS, MVT::f128, Expand); setOperationAction(ISD::FDIV, MVT::f128, LibCall); setOperationAction(ISD::FMA, MVT::f128, Expand); setOperationAction(ISD::FMUL, MVT::f128, LibCall); setOperationAction(ISD::FNEG, MVT::f128, Expand); setOperationAction(ISD::FPOW, MVT::f128, Expand); setOperationAction(ISD::FREM, MVT::f128, Expand); setOperationAction(ISD::FRINT, MVT::f128, Expand); setOperationAction(ISD::FSIN, MVT::f128, Expand); setOperationAction(ISD::FSINCOS, MVT::f128, Expand); setOperationAction(ISD::FSQRT, MVT::f128, Expand); setOperationAction(ISD::FSUB, MVT::f128, LibCall); setOperationAction(ISD::FTRUNC, MVT::f128, Expand); setOperationAction(ISD::SETCC, MVT::f128, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom); setOperationAction(ISD::BR_CC, MVT::f128, Custom); setOperationAction(ISD::SELECT, MVT::f128, Custom); setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently // aren't handled. // Lowering for many of the conversions is actually specified by the non-f128 // type. The LowerXXX function will be trivial when f128 isn't involved. 
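  // As a rough illustration of why these conversions need hooks (not taken
  // from this patch; the libcall names follow the usual compiler-rt/libgcc
  // conventions and are shown only for orientation):
  //   f128 -> f64   FP_ROUND    becomes a call such as __trunctfdf2
  //   f128 -> i32   FP_TO_SINT  becomes a call such as __fixtfsi
  //   f64  -> i32   FP_TO_SINT  stays a single fcvtzs instruction
  // so the Custom hooks below mostly dispatch between the libcall path and
  // the ordinary hardware lowering.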
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); // Variable arguments. setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::Other, Custom); setOperationAction(ISD::VACOPY, MVT::Other, Custom); setOperationAction(ISD::VAEND, MVT::Other, Expand); // Variable-sized objects. setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); if (Subtarget->isTargetWindows()) setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); else setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); // Constant pool entries setOperationAction(ISD::ConstantPool, MVT::i64, Custom); // BlockAddress setOperationAction(ISD::BlockAddress, MVT::i64, Custom); // AArch64 lacks both left-rotate and popcount instructions. setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); } // AArch64 doesn't have i32 MULH{S|U}. setOperationAction(ISD::MULHU, MVT::i32, Expand); setOperationAction(ISD::MULHS, MVT::i32, Expand); // AArch64 doesn't have {U|S}MUL_LOHI. 
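  // Expanding *MUL_LOHI splits a widening multiply into separate low/high
  // halves; approximately (illustrative AArch64 output, not from this patch):
  //   mul   x2, x0, x1      // low 64 bits of the product
  //   umulh x3, x0, x1      // high 64 bits (smulh for the signed form)
  // so plain MUL/MULH stay usable while the fused LOHI node does not exist.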
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); if (Subtarget->hasCSSC()) { setOperationAction(ISD::CTPOP, MVT::i32, Legal); setOperationAction(ISD::CTPOP, MVT::i64, Legal); setOperationAction(ISD::CTPOP, MVT::i128, Expand); setOperationAction(ISD::PARITY, MVT::i128, Expand); setOperationAction(ISD::CTTZ, MVT::i32, Legal); setOperationAction(ISD::CTTZ, MVT::i64, Legal); setOperationAction(ISD::CTTZ, MVT::i128, Expand); setOperationAction(ISD::ABS, MVT::i32, Legal); setOperationAction(ISD::ABS, MVT::i64, Legal); setOperationAction(ISD::SMAX, MVT::i32, Legal); setOperationAction(ISD::SMAX, MVT::i64, Legal); setOperationAction(ISD::UMAX, MVT::i32, Legal); setOperationAction(ISD::UMAX, MVT::i64, Legal); setOperationAction(ISD::SMIN, MVT::i32, Legal); setOperationAction(ISD::SMIN, MVT::i64, Legal); setOperationAction(ISD::UMIN, MVT::i32, Legal); setOperationAction(ISD::UMIN, MVT::i64, Legal); } else { setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i64, Custom); setOperationAction(ISD::CTPOP, MVT::i128, Custom); setOperationAction(ISD::PARITY, MVT::i64, Custom); setOperationAction(ISD::PARITY, MVT::i128, Custom); setOperationAction(ISD::ABS, MVT::i32, Custom); setOperationAction(ISD::ABS, MVT::i64, Custom); } setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); } setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::SREM, MVT::i64, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); setOperationAction(ISD::UDIVREM, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); // Custom lower Add/Sub/Mul with overflow. 
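  // The custom lowering maps the overflow bit onto NZCV instead of
  // recomputing it; roughly (illustrative, actual selection may differ):
  //   {i32, i1} = uaddo(a, b)   =>   adds w0, w0, w1 ; cset w1, hs
  //   {i32, i1} = saddo(a, b)   =>   adds w0, w0, w1 ; cset w1, vs
  // The multiply-with-overflow forms instead compare a widened product
  // against its truncation.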
setOperationAction(ISD::SADDO, MVT::i32, Custom); setOperationAction(ISD::SADDO, MVT::i64, Custom); setOperationAction(ISD::UADDO, MVT::i32, Custom); setOperationAction(ISD::UADDO, MVT::i64, Custom); setOperationAction(ISD::SSUBO, MVT::i32, Custom); setOperationAction(ISD::SSUBO, MVT::i64, Custom); setOperationAction(ISD::USUBO, MVT::i32, Custom); setOperationAction(ISD::USUBO, MVT::i64, Custom); setOperationAction(ISD::SMULO, MVT::i32, Custom); setOperationAction(ISD::SMULO, MVT::i64, Custom); setOperationAction(ISD::UMULO, MVT::i32, Custom); setOperationAction(ISD::UMULO, MVT::i64, Custom); setOperationAction(ISD::UADDO_CARRY, MVT::i32, Custom); setOperationAction(ISD::UADDO_CARRY, MVT::i64, Custom); setOperationAction(ISD::USUBO_CARRY, MVT::i32, Custom); setOperationAction(ISD::USUBO_CARRY, MVT::i64, Custom); setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom); setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom); setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom); setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom); setOperationAction(ISD::FSIN, MVT::f32, Expand); setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); if (Subtarget->hasFullFP16()) setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom); else setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI, ISD::FCOS, ISD::FSIN, ISD::FSINCOS, ISD::FEXP, ISD::FEXP2, ISD::FLOG, ISD::FLOG2, ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS, ISD::STRICT_FSIN, ISD::STRICT_FEXP, ISD::STRICT_FEXP2, ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) { setOperationAction(Op, MVT::f16, Promote); setOperationAction(Op, MVT::v4f16, Expand); setOperationAction(Op, MVT::v8f16, Expand); } if (!Subtarget->hasFullFP16()) { for (auto Op : {ISD::SETCC, ISD::SELECT_CC, ISD::BR_CC, ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV, ISD::FMA, ISD::FNEG, ISD::FABS, ISD::FCEIL, ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR, ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT, ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM}) setOperationAction(Op, MVT::f16, Promote); // Round-to-integer need custom lowering for fp16, as Promote doesn't work // because the result type is integer. for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) setOperationAction(Op, MVT::f16, Custom); // promote v4f16 to v4f32 when that is known to be safe. 
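  // "Promote" here means the arithmetic is carried out in f32 and narrowed
  // back, e.g. a v4f16 FADD becomes roughly (illustrative):
  //   fcvtl v1.4s, v1.4h
  //   fcvtl v0.4s, v0.4h
  //   fadd  v0.4s, v0.4s, v1.4s
  //   fcvtn v0.4h, v0.4s
  // which is safe for these operations because rounding through f32 cannot
  // change the final f16 result.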
setOperationPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); setOperationPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32); setOperationPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); setOperationPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); setOperationAction(ISD::FABS, MVT::v4f16, Expand); setOperationAction(ISD::FNEG, MVT::v4f16, Expand); setOperationAction(ISD::FROUND, MVT::v4f16, Expand); setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Expand); setOperationAction(ISD::FMA, MVT::v4f16, Expand); setOperationAction(ISD::SETCC, MVT::v4f16, Custom); setOperationAction(ISD::BR_CC, MVT::v4f16, Expand); setOperationAction(ISD::SELECT, MVT::v4f16, Expand); setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand); setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand); setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand); setOperationAction(ISD::FCEIL, MVT::v4f16, Expand); setOperationAction(ISD::FRINT, MVT::v4f16, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); setOperationAction(ISD::FSQRT, MVT::v4f16, Expand); setOperationAction(ISD::FABS, MVT::v8f16, Expand); setOperationAction(ISD::FADD, MVT::v8f16, Expand); setOperationAction(ISD::FCEIL, MVT::v8f16, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand); setOperationAction(ISD::FDIV, MVT::v8f16, Expand); setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand); setOperationAction(ISD::FMA, MVT::v8f16, Expand); setOperationAction(ISD::FMUL, MVT::v8f16, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); setOperationAction(ISD::FNEG, MVT::v8f16, Expand); setOperationAction(ISD::FROUND, MVT::v8f16, Expand); setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Expand); setOperationAction(ISD::FRINT, MVT::v8f16, Expand); setOperationAction(ISD::FSQRT, MVT::v8f16, Expand); setOperationAction(ISD::FSUB, MVT::v8f16, Expand); setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand); setOperationAction(ISD::SETCC, MVT::v8f16, Expand); setOperationAction(ISD::BR_CC, MVT::v8f16, Expand); setOperationAction(ISD::SELECT, MVT::v8f16, Expand); setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand); } // AArch64 has implementations of a lot of rounding-like FP operations. 
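  // These map essentially one-to-one onto the FRINT* family (typical mapping,
  // shown for orientation only):
  //   FFLOOR -> frintm, FCEIL -> frintp, FTRUNC -> frintz, FROUND -> frinta,
  //   FROUNDEVEN -> frintn, FRINT -> frintx, FNEARBYINT -> frinti
  // which is why they can simply be marked Legal for f32/f64 (and for f16
  // when full fp16 is available).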
for (auto Op : {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, ISD::FROUND, ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT, ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FNEARBYINT, ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, ISD::STRICT_FROUND, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) { for (MVT Ty : {MVT::f32, MVT::f64}) setOperationAction(Op, Ty, Legal); if (Subtarget->hasFullFP16()) setOperationAction(Op, MVT::f16, Legal); } // Basic strict FP operations are legal for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) { for (MVT Ty : {MVT::f32, MVT::f64}) setOperationAction(Op, Ty, Legal); if (Subtarget->hasFullFP16()) setOperationAction(Op, MVT::f16, Legal); } // Strict conversion to a larger type is legal for (auto VT : {MVT::f32, MVT::f64}) setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); setOperationAction(ISD::PREFETCH, MVT::Other, Custom); setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); // Generate outline atomics library calls only if LSE was not specified for // subtarget if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) { setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall); setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall); setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall); setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall); setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall); setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall); setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall); setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall); setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall); setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall); setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall); setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall); setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall); setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall); setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall); setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall); setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall); setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall); setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall); setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall); setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall); setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall); setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall); setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall); setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall); #define LCALLNAMES(A, B, N) \ setLibcallName(A##N##_RELAX, #B #N "_relax"); \ setLibcallName(A##N##_ACQ, #B #N "_acq"); \ setLibcallName(A##N##_REL, #B #N "_rel"); \ setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel"); #define LCALLNAME4(A, B) \ LCALLNAMES(A, B, 1) \ LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 
4) LCALLNAMES(A, B, 8) #define LCALLNAME5(A, B) \ LCALLNAMES(A, B, 1) \ LCALLNAMES(A, B, 2) \ LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16) LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas) LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd) LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset) LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr) LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor) #undef LCALLNAMES #undef LCALLNAME4 #undef LCALLNAME5 } if (Subtarget->hasLSE128()) { // Custom lowering because i128 is not legal. Must be replaced by 2x64 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP. setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom); setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom); setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom); } // 128-bit loads and stores can be done without expanding setOperationAction(ISD::LOAD, MVT::i128, Custom); setOperationAction(ISD::STORE, MVT::i128, Custom); // Aligned 128-bit loads and stores are single-copy atomic according to the // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2. if (Subtarget->hasLSE2()) { setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom); setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom); } // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the // custom lowering, as there are no un-paired non-temporal stores and // legalization will break up 256 bit inputs. setOperationAction(ISD::STORE, MVT::v32i8, Custom); setOperationAction(ISD::STORE, MVT::v16i16, Custom); setOperationAction(ISD::STORE, MVT::v16f16, Custom); setOperationAction(ISD::STORE, MVT::v8i32, Custom); setOperationAction(ISD::STORE, MVT::v8f32, Custom); setOperationAction(ISD::STORE, MVT::v4f64, Custom); setOperationAction(ISD::STORE, MVT::v4i64, Custom); // 256 bit non-temporal loads can be lowered to LDNP. This is done using // custom lowering, as there are no un-paired non-temporal loads legalization // will break up 256 bit inputs. setOperationAction(ISD::LOAD, MVT::v32i8, Custom); setOperationAction(ISD::LOAD, MVT::v16i16, Custom); setOperationAction(ISD::LOAD, MVT::v16f16, Custom); setOperationAction(ISD::LOAD, MVT::v8i32, Custom); setOperationAction(ISD::LOAD, MVT::v8f32, Custom); setOperationAction(ISD::LOAD, MVT::v4f64, Custom); setOperationAction(ISD::LOAD, MVT::v4i64, Custom); // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { // Issue __sincos_stret if available. setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } else { setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); } if (Subtarget->getTargetTriple().isOSMSVCRT()) { // MSVCRT doesn't have powi; fall back to pow setLibcallName(RTLIB::POWI_F32, nullptr); setLibcallName(RTLIB::POWI_F64, nullptr); } // Make floating-point constants legal for the large code model, so they don't // become loads from the constant pool. 
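  // With ConstantFP marked Legal, an FP immediate is materialized in
  // registers (e.g. "fmov d0, #1.0", or an integer mov/movk plus fmov for
  // harder values) rather than through an adrp+ldr of a constant-pool entry,
  // whose addressing is what becomes expensive under the large code model.
  // The exact sequence is still chosen later during selection; this is only
  // a sketch of the intent.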
if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { setOperationAction(ISD::ConstantFP, MVT::f32, Legal); setOperationAction(ISD::ConstantFP, MVT::f64, Legal); } // AArch64 does not have floating-point extending loads, i1 sign-extending // load, floating-point truncating stores, or v2i32->v2i16 truncating store. for (MVT VT : MVT::fp_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); } for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f128, MVT::f80, Expand); setTruncStoreAction(MVT::f128, MVT::f64, Expand); setTruncStoreAction(MVT::f128, MVT::f32, Expand); setTruncStoreAction(MVT::f128, MVT::f16, Expand); setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::BITCAST, MVT::f16, Custom); setOperationAction(ISD::BITCAST, MVT::bf16, Custom); // Indexed loads and stores are supported. for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, MVT::i8, Legal); setIndexedLoadAction(im, MVT::i16, Legal); setIndexedLoadAction(im, MVT::i32, Legal); setIndexedLoadAction(im, MVT::i64, Legal); setIndexedLoadAction(im, MVT::f64, Legal); setIndexedLoadAction(im, MVT::f32, Legal); setIndexedLoadAction(im, MVT::f16, Legal); setIndexedLoadAction(im, MVT::bf16, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); setIndexedStoreAction(im, MVT::i64, Legal); setIndexedStoreAction(im, MVT::f64, Legal); setIndexedStoreAction(im, MVT::f32, Legal); setIndexedStoreAction(im, MVT::f16, Legal); setIndexedStoreAction(im, MVT::bf16, Legal); } // Trap. setOperationAction(ISD::TRAP, MVT::Other, Legal); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal); // We combine OR nodes for bitfield operations. setTargetDAGCombine(ISD::OR); // Try to create BICs for vector ANDs. setTargetDAGCombine(ISD::AND); // Vector add and sub nodes may conceal a high-half opportunity. // Also, try to fold ADD into CSINC/CSINV.. 
setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP, ISD::UINT_TO_FP}); setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::FADD, ISD::FDIV}); // Try and combine setcc with csel setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND, ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG, ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR, ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR}); setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MSTORE); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine({ISD::SELECT, ISD::VSELECT}); setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN, ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, ISD::VECREDUCE_ADD, ISD::STEP_VECTOR}); setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER}); setTargetDAGCombine(ISD::FP_EXTEND); setTargetDAGCombine(ISD::GlobalAddress); setTargetDAGCombine(ISD::CTLZ); setTargetDAGCombine(ISD::VECREDUCE_AND); setTargetDAGCombine(ISD::VECREDUCE_OR); setTargetDAGCombine(ISD::VECREDUCE_XOR); // In case of strict alignment, avoid an excessive number of byte wide stores. MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemset = Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32; MaxGluedStoresPerMemcpy = 4; MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemcpy = Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16; MaxStoresPerMemmoveOptSize = 4; MaxStoresPerMemmove = 4; MaxLoadsPerMemcmpOptSize = 4; MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8; setStackPointerRegisterToSaveRestore(AArch64::SP); setSchedulingPreference(Sched::Hybrid); EnableExtLdPromotion = true; // Set required alignment. setMinFunctionAlignment(Align(4)); // Set preferred alignments. setPrefLoopAlignment(STI.getPrefLoopAlignment()); setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment()); setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); // Only change the limit for entries in a jump table if specified by // the sub target, but not at the command line. 
unsigned MaxJT = STI.getMaximumJumpTableSize(); if (MaxJT && getMaximumJumpTableSize() == UINT_MAX) setMaximumJumpTableSize(MaxJT); setHasExtractBitsInsn(true); setMaxDivRemBitWidthSupported(128); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); if (Subtarget->hasNEON()) { // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to // silliness like this: for (auto Op : {ISD::SELECT, ISD::SELECT_CC, ISD::BR_CC, ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV, ISD::FMA, ISD::FNEG, ISD::FABS, ISD::FCEIL, ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR, ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT, ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM}) setOperationAction(Op, MVT::v1f64, Expand); for (auto Op : {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT, ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND}) setOperationAction(Op, MVT::v1i64, Expand); // AArch64 doesn't have a direct vector ->f32 conversion instructions for // elements smaller than i32, so promote the input to i32 first. setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32); setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32); // Similarly, there is no direct i32 -> f64 vector conversion instruction. // Or, direct i32 -> f16 vector conversion. Set it so custom, so the // conversion happens in two steps: v4i32 -> v4f32 -> v4f16 for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP}) for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32}) setOperationAction(Op, VT, Custom); if (Subtarget->hasFullFP16()) { setOperationAction(ISD::ConstantFP, MVT::f16, Legal); setOperationAction(ISD::ConstantFP, MVT::bf16, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); } else { // when AArch64 doesn't have fullfp16 support, promote the input // to i32 first. 
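  // i.e. without +fullfp16 there is no direct integer -> f16 conversion, so
  // e.g. a v4i16 SINT_TO_FP to v4f16 is legalized in stages, roughly:
  //   v4i16 --sign-extend--> v4i32 --scvtf--> v4f32 --fcvtn--> v4f16
  // (the exact node sequence is driven by the promotions below).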
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32); setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32); setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32); setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32); setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32); setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32); setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32); } setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal); setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal); setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom); setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom); setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom); setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom); for (auto VT : {MVT::v1i64, MVT::v2i64}) { setOperationAction(ISD::UMAX, VT, Custom); setOperationAction(ISD::SMAX, VT, Custom); setOperationAction(ISD::UMIN, VT, Custom); setOperationAction(ISD::SMIN, VT, Custom); } // Custom handling for some quad-vector types to detect MULL. setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::MUL, MVT::v4i16, Custom); setOperationAction(ISD::MUL, MVT::v2i32, Custom); setOperationAction(ISD::MUL, MVT::v1i64, Custom); // Saturates for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SADDSAT, VT, Legal); setOperationAction(ISD::UADDSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); } for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16, MVT::v4i32}) { setOperationAction(ISD::AVGFLOORS, VT, Legal); setOperationAction(ISD::AVGFLOORU, VT, Legal); setOperationAction(ISD::AVGCEILS, VT, Legal); setOperationAction(ISD::AVGCEILU, VT, Legal); setOperationAction(ISD::ABDS, VT, Legal); setOperationAction(ISD::ABDU, VT, Legal); } // Vector reductions for (MVT VT : { MVT::v4f16, MVT::v2f32, MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) { setOperationAction(ISD::VECREDUCE_FMAX, VT, Legal); setOperationAction(ISD::VECREDUCE_FMIN, VT, Legal); setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Legal); setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Legal); setOperationAction(ISD::VECREDUCE_FADD, VT, Legal); } } for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_AND, VT, Custom); setOperationAction(ISD::VECREDUCE_OR, VT, Custom); setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); } setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom); setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom); setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom); setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); // Likewise, 
narrowing and extending vector loads/stores aren't handled // directly. for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) { setOperationAction(ISD::MULHS, VT, Legal); setOperationAction(ISD::MULHU, VT, Legal); } else { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); } setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); } } // AArch64 has implementations of a lot of rounding-like FP operations. for (auto Op : {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR, ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) { for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) setOperationAction(Op, Ty, Legal); if (Subtarget->hasFullFP16()) for (MVT Ty : {MVT::v4f16, MVT::v8f16}) setOperationAction(Op, Ty, Legal); } setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); setOperationAction(ISD::BITCAST, MVT::i2, Custom); setOperationAction(ISD::BITCAST, MVT::i4, Custom); setOperationAction(ISD::BITCAST, MVT::i8, Custom); setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::BITCAST, MVT::v2i8, Custom); setOperationAction(ISD::BITCAST, MVT::v2i16, Custom); setOperationAction(ISD::BITCAST, MVT::v4i8, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); // ADDP custom lowering for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) setOperationAction(ISD::ADD, VT, Custom); // FADDP custom lowering for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 }) setOperationAction(ISD::FADD, VT, Custom); } if (Subtarget->hasSME()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); } // FIXME: Move lowering for more nodes here if those are common between // SVE and SME. 
if (Subtarget->hasSVEorSME()) { for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); } } if (Subtarget->hasSVE()) { for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) { setOperationAction(ISD::BITREVERSE, VT, Custom); setOperationAction(ISD::BSWAP, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::UINT_TO_FP, VT, Custom); setOperationAction(ISD::SINT_TO_FP, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::MULHS, VT, Custom); setOperationAction(ISD::MULHU, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SDIV, VT, Custom); setOperationAction(ISD::UDIV, VT, Custom); setOperationAction(ISD::SMIN, VT, Custom); setOperationAction(ISD::UMIN, VT, Custom); setOperationAction(ISD::SMAX, VT, Custom); setOperationAction(ISD::UMAX, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::ABS, VT, Custom); setOperationAction(ISD::ABDS, VT, Custom); setOperationAction(ISD::ABDU, VT, Custom); setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); setOperationAction(ISD::VECREDUCE_AND, VT, Custom); setOperationAction(ISD::VECREDUCE_OR, VT, Custom); setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::SADDSAT, VT, Legal); setOperationAction(ISD::UADDSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::AVGFLOORS, VT, Custom); setOperationAction(ISD::AVGFLOORU, VT, Custom); setOperationAction(ISD::AVGCEILS, VT, Custom); setOperationAction(ISD::AVGCEILU, VT, Custom); } // Illegal unpacked integer vector types. for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); } // Legalize unpacked bitcasts to REINTERPRET_CAST. 
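  // "Unpacked" types keep one element per container lane, e.g. nxv2f32 holds
  // its elements in the low 32 bits of each 64-bit lane of a Z register. A
  // bitcast between such types therefore moves no data, so it is modelled as
  // a reinterpreting register copy rather than a real conversion (hence the
  // Custom action below).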
for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32}) setOperationAction(ISD::BITCAST, VT, Custom); for (auto VT : { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 }) setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal); for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::VECREDUCE_AND, VT, Custom); setOperationAction(ISD::VECREDUCE_OR, VT, Custom); setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); // There are no legal MVT::nxv16f## based types. if (VT != MVT::nxv16i1) { setOperationAction(ISD::SINT_TO_FP, VT, Custom); setOperationAction(ISD::UINT_TO_FP, VT, Custom); } } // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) { setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MSTORE, VT, Custom); setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } // Firstly, exclude all scalable vector extending loads/truncating stores, // include both integer and floating scalable vector. for (MVT VT : MVT::scalable_vector_valuetypes()) { for (MVT InnerVT : MVT::scalable_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); } } // Then, selectively enable those which we directly support. 
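  // The combinations enabled below are the ones a single SVE load/store can
  // perform directly, e.g. (illustrative assembly, not from this patch):
  //   ld1b  { z0.d }, p0/z, [x0]   // nxv2i8 load zero-extended to nxv2i64
  //   ld1sh { z0.s }, p0/z, [x0]   // nxv4i16 load sign-extended to nxv4i32
  //   st1w  { z0.d }, p0, [x0]     // nxv2i64 store truncated to nxv2i32
  // Everything else stays expanded into a full-width access plus separate
  // extend/truncate nodes.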
setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal); setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal); setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal); setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal); setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal); setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal); for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) { setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal); setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal); setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal); setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal); setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal); setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal); } // SVE supports truncating stores of 64 and 128-bit vectors setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom); setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom); setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom); setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32, MVT::nxv2f64}) { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::FADD, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Custom); setOperationAction(ISD::FDIV, VT, Custom); setOperationAction(ISD::FMA, VT, Custom); setOperationAction(ISD::FMAXIMUM, VT, Custom); setOperationAction(ISD::FMAXNUM, VT, Custom); setOperationAction(ISD::FMINIMUM, VT, Custom); setOperationAction(ISD::FMINNUM, VT, Custom); setOperationAction(ISD::FMUL, VT, Custom); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FSUB, VT, Custom); setOperationAction(ISD::FCEIL, VT, Custom); setOperationAction(ISD::FFLOOR, VT, Custom); setOperationAction(ISD::FNEARBYINT, VT, Custom); setOperationAction(ISD::FRINT, VT, Custom); setOperationAction(ISD::FROUND, VT, Custom); setOperationAction(ISD::FROUNDEVEN, VT, Custom); setOperationAction(ISD::FTRUNC, VT, Custom); setOperationAction(ISD::FSQRT, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FP_EXTEND, VT, Custom); setOperationAction(ISD::FP_ROUND, VT, Custom); setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom); setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom); setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FPOWI, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FLOG10, 
VT, Expand); setCondCodeAction(ISD::SETO, VT, Expand); setCondCodeAction(ISD::SETOLT, VT, Expand); setCondCodeAction(ISD::SETLT, VT, Expand); setCondCodeAction(ISD::SETOLE, VT, Expand); setCondCodeAction(ISD::SETLE, VT, Expand); setCondCodeAction(ISD::SETULT, VT, Expand); setCondCodeAction(ISD::SETULE, VT, Expand); setCondCodeAction(ISD::SETUGE, VT, Expand); setCondCodeAction(ISD::SETUGT, VT, Expand); setCondCodeAction(ISD::SETUEQ, VT, Expand); setCondCodeAction(ISD::SETONE, VT, Expand); } for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); } setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); // NEON doesn't support integer divides, but SVE does for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) { setOperationAction(ISD::SDIV, VT, Custom); setOperationAction(ISD::UDIV, VT, Custom); } // NEON doesn't support 64-bit vector integer muls, but SVE does. setOperationAction(ISD::MUL, MVT::v1i64, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); // NEON doesn't support across-vector reductions, but SVE does. for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64}) setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); if (!Subtarget->isNeonAvailable()) { setTruncStoreAction(MVT::v2f32, MVT::v2f16, Custom); setTruncStoreAction(MVT::v4f32, MVT::v4f16, Custom); setTruncStoreAction(MVT::v8f32, MVT::v8f16, Custom); setTruncStoreAction(MVT::v1f64, MVT::v1f16, Custom); setTruncStoreAction(MVT::v2f64, MVT::v2f16, Custom); setTruncStoreAction(MVT::v4f64, MVT::v4f16, Custom); setTruncStoreAction(MVT::v1f64, MVT::v1f32, Custom); setTruncStoreAction(MVT::v2f64, MVT::v2f32, Custom); setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom); for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true); for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64}) addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true); } // NOTE: Currently this has to happen after computeRegisterProperties rather // than the preferred option of combining it with the addRegisterClass call. if (Subtarget->useSVEForFixedLengthVectors()) { for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) if (useSVEForFixedLengthVectorVT(VT)) addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false); for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) if (useSVEForFixedLengthVectorVT(VT)) addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false); // 64bit results can mean a bigger than NEON input. for (auto VT : {MVT::v8i8, MVT::v4i16}) setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom); // 128bit results imply a bigger than NEON input. for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) setOperationAction(ISD::TRUNCATE, VT, Custom); for (auto VT : {MVT::v8f16, MVT::v4f32}) setOperationAction(ISD::FP_ROUND, VT, Custom); // These operations are not supported on NEON but SVE can do them. 
setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom); setOperationAction(ISD::CTLZ, MVT::v1i64, Custom); setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); setOperationAction(ISD::MULHS, MVT::v1i64, Custom); setOperationAction(ISD::MULHS, MVT::v2i64, Custom); setOperationAction(ISD::MULHU, MVT::v1i64, Custom); setOperationAction(ISD::MULHU, MVT::v2i64, Custom); setOperationAction(ISD::SMAX, MVT::v1i64, Custom); setOperationAction(ISD::SMAX, MVT::v2i64, Custom); setOperationAction(ISD::SMIN, MVT::v1i64, Custom); setOperationAction(ISD::SMIN, MVT::v2i64, Custom); setOperationAction(ISD::UMAX, MVT::v1i64, Custom); setOperationAction(ISD::UMAX, MVT::v2i64, Custom); setOperationAction(ISD::UMIN, MVT::v1i64, Custom); setOperationAction(ISD::UMIN, MVT::v2i64, Custom); setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom); setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom); setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom); setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom); // Int operations with no NEON support. for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, MVT::v4i32, MVT::v2i64}) { setOperationAction(ISD::BITREVERSE, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::VECREDUCE_AND, VT, Custom); setOperationAction(ISD::VECREDUCE_OR, VT, Custom); setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); setOperationAction(ISD::MULHS, VT, Custom); setOperationAction(ISD::MULHU, VT, Custom); } // Use SVE for vectors with more than 2 elements. for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32}) setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); } setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64); setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32); setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16); setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8); setOperationAction(ISD::VSCALE, MVT::i32, Custom); } if (Subtarget->hasMOPS() && Subtarget->hasMTE()) { // Only required for llvm.aarch64.mops.memset.tag setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); } setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); IsStrictFPEnabled = true; } void AArch64TargetLowering::addTypeForNEON(MVT VT) { assert(VT.isVector() && "VT should be a vector type"); if (VT.isFloatingPoint()) { MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT(); setOperationPromotedToType(ISD::LOAD, VT, PromoteTo); setOperationPromotedToType(ISD::STORE, VT, PromoteTo); } // Mark vector float intrinsics as expand. if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); } // But we do support custom-lowering for FCOPYSIGN. 
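  // FCOPYSIGN is implemented as a bitwise select on the sign bit rather than
  // a libm call; roughly, for v4f32 (illustrative only):
  //   movi v2.4s, #128, lsl #24    // 0x80000000 sign-bit mask per lane
  //   bit  v0.16b, v1.16b, v2.16b  // take just the sign bits from v1
  // so only the vector types where such a mask is cheap to build are Custom.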
if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 || ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16())) setOperationAction(ISD::FCOPYSIGN, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::OR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); for (MVT InnerVT : MVT::all_valuetypes()) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); // CNT supports only B element sizes, then use UADDLP to widen. if (VT != MVT::v8i8 && VT != MVT::v16i8) setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); for (unsigned Opcode : {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) setOperationAction(Opcode, VT, Custom); if (!VT.isFloatingPoint()) setOperationAction(ISD::ABS, VT, Legal); // [SU][MIN|MAX] are available for all NEON types apart from i64. if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT, Legal); // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP // NEON types. if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::bf16 && (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())) for (unsigned Opcode : {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM, ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) setOperationAction(Opcode, VT, Legal); // Strict fp extend and trunc are legal if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16) setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64) setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal); // FIXME: We could potentially make use of the vector comparison instructions // for STRICT_FSETCC and STRICT_FSETCSS, but there's a number of // complications: // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons, // so we would need to expand when the condition code doesn't match the // kind of comparison. // * Some kinds of comparison require more than one FCMXY instruction so // would need to be expanded instead. // * The lowering of the non-strict versions involves target-specific ISD // nodes so we would likely need to add strict versions of all of them and // handle them appropriately. 
setOperationAction(ISD::STRICT_FSETCC, VT, Expand); setOperationAction(ISD::STRICT_FSETCCS, VT, Expand); if (Subtarget->isLittleEndian()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, VT, Legal); setIndexedStoreAction(im, VT, Legal); } } if (Subtarget->hasD128()) { setOperationAction(ISD::READ_REGISTER, MVT::i128, Custom); setOperationAction(ISD::WRITE_REGISTER, MVT::i128, Custom); } } bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT, EVT OpVT) const { // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo). if (!Subtarget->hasSVE()) return true; // We can only support legal predicate result types. We can use the SVE // whilelo instruction for generating fixed-width predicates too. if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 && ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 && ResVT != MVT::v8i1 && ResVT != MVT::v16i1) return true; // The whilelo instruction only works with i32 or i64 scalar inputs. if (OpVT != MVT::i32 && OpVT != MVT::i64) return true; return false; } void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT, bool StreamingSVE) { assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); // By default everything must be expanded. for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) setOperationAction(Op, VT, Expand); if (VT.isFloatingPoint()) { setCondCodeAction(ISD::SETO, VT, Expand); setCondCodeAction(ISD::SETOLT, VT, Expand); setCondCodeAction(ISD::SETOLE, VT, Expand); setCondCodeAction(ISD::SETULT, VT, Expand); setCondCodeAction(ISD::SETULE, VT, Expand); setCondCodeAction(ISD::SETUGE, VT, Expand); setCondCodeAction(ISD::SETUGT, VT, Expand); setCondCodeAction(ISD::SETUEQ, VT, Expand); setCondCodeAction(ISD::SETONE, VT, Expand); } // Mark integer truncating stores/extending loads as having custom lowering if (VT.isInteger()) { MVT InnerVT = VT.changeVectorElementType(MVT::i8); while (InnerVT != VT) { setTruncStoreAction(VT, InnerVT, Custom); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom); InnerVT = InnerVT.changeVectorElementType( MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits())); } } // Mark floating-point truncating stores/extending loads as having custom // lowering if (VT.isFloatingPoint()) { MVT InnerVT = VT.changeVectorElementType(MVT::f16); while (InnerVT != VT) { setTruncStoreAction(VT, InnerVT, Custom); setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom); InnerVT = InnerVT.changeVectorElementType( MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits())); } } // Lower fixed length vector operations to scalable equivalents. setOperationAction(ISD::ABS, VT, Custom); setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::AND, VT, Custom); setOperationAction(ISD::ANY_EXTEND, VT, Custom); setOperationAction(ISD::BITCAST, VT, StreamingSVE ? 
Legal : Custom); setOperationAction(ISD::BITREVERSE, VT, Custom); setOperationAction(ISD::BSWAP, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FADD, VT, Custom); setOperationAction(ISD::FCEIL, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Custom); setOperationAction(ISD::FDIV, VT, Custom); setOperationAction(ISD::FFLOOR, VT, Custom); setOperationAction(ISD::FMA, VT, Custom); setOperationAction(ISD::FMAXIMUM, VT, Custom); setOperationAction(ISD::FMAXNUM, VT, Custom); setOperationAction(ISD::FMINIMUM, VT, Custom); setOperationAction(ISD::FMINNUM, VT, Custom); setOperationAction(ISD::FMUL, VT, Custom); setOperationAction(ISD::FNEARBYINT, VT, Custom); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FP_EXTEND, VT, Custom); setOperationAction(ISD::FP_ROUND, VT, Custom); setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); setOperationAction(ISD::FRINT, VT, Custom); setOperationAction(ISD::FROUND, VT, Custom); setOperationAction(ISD::FROUNDEVEN, VT, Custom); setOperationAction(ISD::FSQRT, VT, Custom); setOperationAction(ISD::FSUB, VT, Custom); setOperationAction(ISD::FTRUNC, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::LOAD, VT, StreamingSVE ? Legal : Custom); setOperationAction(ISD::MGATHER, VT, StreamingSVE ? Expand : Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MSCATTER, VT, StreamingSVE ? Expand : Custom); setOperationAction(ISD::MSTORE, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::MULHS, VT, Custom); setOperationAction(ISD::MULHU, VT, Custom); setOperationAction(ISD::OR, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, StreamingSVE ? Legal : Expand); setOperationAction(ISD::SDIV, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom); setOperationAction(ISD::SINT_TO_FP, VT, Custom); setOperationAction(ISD::SMAX, VT, Custom); setOperationAction(ISD::SMIN, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::STORE, VT, StreamingSVE ? 
                     Legal : Custom);
  setOperationAction(ISD::SUB, VT, Custom);
  setOperationAction(ISD::TRUNCATE, VT, Custom);
  setOperationAction(ISD::UDIV, VT, Custom);
  setOperationAction(ISD::UINT_TO_FP, VT, Custom);
  setOperationAction(ISD::UMAX, VT, Custom);
  setOperationAction(ISD::UMIN, VT, Custom);
  setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
  setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
  setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
  setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
  setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
  setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
  setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
  setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
  setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
  setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
  setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
  setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
  setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
  setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
  setOperationAction(ISD::VSELECT, VT, Custom);
  setOperationAction(ISD::XOR, VT, Custom);
  setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
}

void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR64RegClass);
  addTypeForNEON(VT);
}

void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR128RegClass);
  addTypeForNEON(VT);
}

EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
                                              LLVMContext &C, EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  if (VT.isScalableVector())
    return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
  return VT.changeVectorElementTypeToInteger();
}

// isIntImmediate - This method tests to see if the node is a constant
// operand. If so Imm will receive the value.
static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
    Imm = C->getZExtValue();
    return true;
  }
  return false;
}

// isOpcWithIntImmediate - This method tests to see if the node is a specific
// opcode and that it has an immediate integer right operand.
// If so Imm will receive the value.
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
                                  uint64_t &Imm) {
  return N->getOpcode() == Opc &&
         isIntImmediate(N->getOperand(1).getNode(), Imm);
}

static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
                               const APInt &Demanded,
                               TargetLowering::TargetLoweringOpt &TLO,
                               unsigned NewOpc) {
  uint64_t OldImm = Imm, NewImm, Enc;
  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;

  // Return if the immediate is already all zeros, all ones, a bimm32 or a
  // bimm64.
  if (Imm == 0 || Imm == Mask ||
      AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
    return false;

  unsigned EltSize = Size;
  uint64_t DemandedBits = Demanded.getZExtValue();

  // Clear bits that are not demanded.
  Imm &= DemandedBits;

  while (true) {
    // The goal here is to set the non-demanded bits in a way that minimizes
    // the number of switches between 0 and 1. In order to achieve this goal,
    // we set the non-demanded bits to the value of the preceding demanded
    // bits. For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
    // non-demanded bit), we copy bit0 (1) to the least significant 'x',
    // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
    // The final result is 0b11000011.
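    // Editorial worked example (an illustrative sketch only, using an 8-bit
    // element so the numbers stay small; the real code runs on 32- or 64-bit
    // elements): for the immediate above, DemandedBits = 0b01100101 and Imm,
    // after clearing the non-demanded bits, is 0b01000001, so the code below
    // computes
    //   NonDemandedBits = 0b10011010
    //   InvertedImm     = 0b00100100   (the demanded zero bits)
    //   RotatedImm      = 0b00001000   (the wrap-around bit is zero here)
    //   Sum             = 0b10100010 with Carry = 0
    //   Ones            = 0b10000010
    //   NewImm          = (Imm | Ones) & Mask = 0b11000011, as claimed above.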
    uint64_t NonDemandedBits = ~DemandedBits;
    uint64_t InvertedImm = ~Imm & DemandedBits;
    uint64_t RotatedImm =
        ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
        NonDemandedBits;
    uint64_t Sum = RotatedImm + NonDemandedBits;
    bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
    uint64_t Ones = (Sum + Carry) & NonDemandedBits;
    NewImm = (Imm | Ones) & Mask;

    // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
    // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
    // we halve the element size and continue the search.
    if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
      break;

    // We cannot shrink the element size any further if it is 2-bits.
    if (EltSize == 2)
      return false;

    EltSize /= 2;
    Mask >>= EltSize;
    uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;

    // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
    if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
      return false;

    // Merge the upper and lower halves of Imm and DemandedBits.
    Imm |= Hi;
    DemandedBits |= DemandedBitsHi;
  }

  ++NumOptimizedImms;

  // Replicate the element across the register width.
  while (EltSize < Size) {
    NewImm |= NewImm << EltSize;
    EltSize *= 2;
  }

  (void)OldImm;
  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
         "demanded bits should never be altered");
  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");

  // Create the new constant immediate node.
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  SDValue New;

  // If the new constant immediate is all-zeros or all-ones, let the target
  // independent DAG combine optimize this node.
  if (NewImm == 0 || NewImm == OrigMask) {
    New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
                          TLO.DAG.getConstant(NewImm, DL, VT));
    // Otherwise, create a machine node so that target independent DAG combine
    // doesn't undo this optimization.
  } else {
    Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
    SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
    New = SDValue(
        TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
  }

  return TLO.CombineTo(Op, New);
}

bool AArch64TargetLowering::targetShrinkDemandedConstant(
    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
    TargetLoweringOpt &TLO) const {
  // Delay this optimization to as late as possible.
  if (!TLO.LegalOps)
    return false;

  if (!EnableOptimizeLogicalImm)
    return false;

  EVT VT = Op.getValueType();
  if (VT.isVector())
    return false;

  unsigned Size = VT.getSizeInBits();
  assert((Size == 32 || Size == 64) &&
         "i32 or i64 is expected after legalization.");

  // Exit early if we demand all bits.
  if (DemandedBits.popcount() == Size)
    return false;

  unsigned NewOpc;
  switch (Op.getOpcode()) {
  default:
    return false;
  case ISD::AND:
    NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
    break;
  case ISD::OR:
    NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
    break;
  case ISD::XOR:
    NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
    break;
  }
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!C)
    return false;
  uint64_t Imm = C->getZExtValue();
  return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
}

/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them Known.
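///
/// An editorial example (an illustrative sketch, not an exhaustive
/// description): for an AArch64ISD::BICi node whose immediate is 0xFF and
/// whose shift amount is 8, each lane is ANDed with ~(0xFF << 8), so the
/// handling below marks bits [8,15] of the result as known zero regardless
/// of what is known about the first operand.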
void AArch64TargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
    const SelectionDAG &DAG, unsigned Depth) const {
  switch (Op.getOpcode()) {
  default:
    break;
  case AArch64ISD::DUP: {
    SDValue SrcOp = Op.getOperand(0);
    Known = DAG.computeKnownBits(SrcOp, Depth + 1);
    if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
      assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
             "Expected DUP implicit truncation");
      Known = Known.trunc(Op.getScalarValueSizeInBits());
    }
    break;
  }
  case AArch64ISD::CSEL: {
    KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known = Known.intersectWith(Known2);
    break;
  }
  case AArch64ISD::BICi: {
    // Compute the bit cleared value.
    uint64_t Mask =
        ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
    break;
  }
  case AArch64ISD::VLSHR: {
    KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known = KnownBits::lshr(Known, Known2);
    break;
  }
  case AArch64ISD::VASHR: {
    KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known = KnownBits::ashr(Known, Known2);
    break;
  }
  case AArch64ISD::MOVI: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(0));
    Known = KnownBits::makeConstant(
        APInt(Known.getBitWidth(), CN->getZExtValue()));
    break;
  }
  case AArch64ISD::LOADgot:
  case AArch64ISD::ADDlow: {
    if (!Subtarget->isTargetILP32())
      break;
    // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
    Known.Zero = APInt::getHighBitsSet(64, 32);
    break;
  }
  case AArch64ISD::ASSERT_ZEXT_BOOL: {
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::aarch64_ldaxr:
    case Intrinsic::aarch64_ldxr: {
      unsigned BitWidth = Known.getBitWidth();
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN:
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    switch (IntNo) {
    default:
      break;
    case Intrinsic::aarch64_neon_umaxv:
    case Intrinsic::aarch64_neon_uminv: {
      // Figure out the datatype of the vector operand. The UMINV instruction
      // will zero extend the result, so we can mark as known zero all the
      // bits larger than the element datatype. 32-bit or larger doesn't need
      // this as those are legal types and will be handled by isel directly.
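      // (Editorial example, for illustration: with a v8i8 or v16i8 operand
      // the code below marks every result bit above bit 7 as known zero, and
      // with a v4i16 or v8i16 operand every bit above bit 15.)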
MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); unsigned BitWidth = Known.getBitWidth(); if (VT == MVT::v8i8 || VT == MVT::v16i8) { assert(BitWidth >= 8 && "Unexpected width!"); APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); Known.Zero |= Mask; } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { assert(BitWidth >= 16 && "Unexpected width!"); APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); Known.Zero |= Mask; } break; } break; } } } } unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode( SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { EVT VT = Op.getValueType(); unsigned VTBits = VT.getScalarSizeInBits(); unsigned Opcode = Op.getOpcode(); switch (Opcode) { case AArch64ISD::CMEQ: case AArch64ISD::CMGE: case AArch64ISD::CMGT: case AArch64ISD::CMHI: case AArch64ISD::CMHS: case AArch64ISD::FCMEQ: case AArch64ISD::FCMGE: case AArch64ISD::FCMGT: case AArch64ISD::CMEQz: case AArch64ISD::CMGEz: case AArch64ISD::CMGTz: case AArch64ISD::CMLEz: case AArch64ISD::CMLTz: case AArch64ISD::FCMEQz: case AArch64ISD::FCMGEz: case AArch64ISD::FCMGTz: case AArch64ISD::FCMLEz: case AArch64ISD::FCMLTz: // Compares return either 0 or all-ones return VTBits; } return 1; } MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, EVT) const { return MVT::i64; } bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const { if (Subtarget->requiresStrictAlign()) return false; if (Fast) { // Some CPUs are fine with unaligned stores except for 128-bit ones. *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || // See comments in performSTORECombine() for more details about // these conditions. // Code that uses clang vector extensions can mark that it // wants unaligned accesses to be treated as fast by // underspecifying alignment to be 1 or 2. Alignment <= 2 || // Disregard v2i64. Memcpy lowering produces those and splitting // them regresses performance on micro-benchmarks and olden/bh. VT == MVT::v2i64; } return true; } // Same as above but handling LLTs instead. bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const { if (Subtarget->requiresStrictAlign()) return false; if (Fast) { // Some CPUs are fine with unaligned stores except for 128-bit ones. *Fast = !Subtarget->isMisaligned128StoreSlow() || Ty.getSizeInBytes() != 16 || // See comments in performSTORECombine() for more details about // these conditions. // Code that uses clang vector extensions can mark that it // wants unaligned accesses to be treated as fast by // underspecifying alignment to be 1 or 2. Alignment <= 2 || // Disregard v2i64. Memcpy lowering produces those and splitting // them regresses performance on micro-benchmarks and olden/bh. 
Ty == LLT::fixed_vector(2, 64); } return true; } FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { return AArch64::createFastISel(funcInfo, libInfo); } const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { #define MAKE_CASE(V) \ case V: \ return #V; switch ((AArch64ISD::NodeType)Opcode) { case AArch64ISD::FIRST_NUMBER: break; MAKE_CASE(AArch64ISD::OBSCURE_COPY) MAKE_CASE(AArch64ISD::SMSTART) MAKE_CASE(AArch64ISD::SMSTOP) MAKE_CASE(AArch64ISD::RESTORE_ZA) MAKE_CASE(AArch64ISD::CALL) MAKE_CASE(AArch64ISD::ADRP) MAKE_CASE(AArch64ISD::ADR) MAKE_CASE(AArch64ISD::ADDlow) MAKE_CASE(AArch64ISD::LOADgot) MAKE_CASE(AArch64ISD::RET_GLUE) MAKE_CASE(AArch64ISD::BRCOND) MAKE_CASE(AArch64ISD::CSEL) MAKE_CASE(AArch64ISD::CSINV) MAKE_CASE(AArch64ISD::CSNEG) MAKE_CASE(AArch64ISD::CSINC) MAKE_CASE(AArch64ISD::THREAD_POINTER) MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) MAKE_CASE(AArch64ISD::ABDS_PRED) MAKE_CASE(AArch64ISD::ABDU_PRED) MAKE_CASE(AArch64ISD::HADDS_PRED) MAKE_CASE(AArch64ISD::HADDU_PRED) MAKE_CASE(AArch64ISD::MUL_PRED) MAKE_CASE(AArch64ISD::MULHS_PRED) MAKE_CASE(AArch64ISD::MULHU_PRED) MAKE_CASE(AArch64ISD::RHADDS_PRED) MAKE_CASE(AArch64ISD::RHADDU_PRED) MAKE_CASE(AArch64ISD::SDIV_PRED) MAKE_CASE(AArch64ISD::SHL_PRED) MAKE_CASE(AArch64ISD::SMAX_PRED) MAKE_CASE(AArch64ISD::SMIN_PRED) MAKE_CASE(AArch64ISD::SRA_PRED) MAKE_CASE(AArch64ISD::SRL_PRED) MAKE_CASE(AArch64ISD::UDIV_PRED) MAKE_CASE(AArch64ISD::UMAX_PRED) MAKE_CASE(AArch64ISD::UMIN_PRED) MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1) MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO) MAKE_CASE(AArch64ISD::ADC) MAKE_CASE(AArch64ISD::SBC) MAKE_CASE(AArch64ISD::ADDS) MAKE_CASE(AArch64ISD::SUBS) MAKE_CASE(AArch64ISD::ADCS) MAKE_CASE(AArch64ISD::SBCS) MAKE_CASE(AArch64ISD::ANDS) MAKE_CASE(AArch64ISD::CCMP) MAKE_CASE(AArch64ISD::CCMN) MAKE_CASE(AArch64ISD::FCCMP) MAKE_CASE(AArch64ISD::FCMP) MAKE_CASE(AArch64ISD::STRICT_FCMP) MAKE_CASE(AArch64ISD::STRICT_FCMPE) MAKE_CASE(AArch64ISD::DUP) MAKE_CASE(AArch64ISD::DUPLANE8) MAKE_CASE(AArch64ISD::DUPLANE16) MAKE_CASE(AArch64ISD::DUPLANE32) MAKE_CASE(AArch64ISD::DUPLANE64) MAKE_CASE(AArch64ISD::DUPLANE128) MAKE_CASE(AArch64ISD::MOVI) MAKE_CASE(AArch64ISD::MOVIshift) MAKE_CASE(AArch64ISD::MOVIedit) MAKE_CASE(AArch64ISD::MOVImsl) MAKE_CASE(AArch64ISD::FMOV) MAKE_CASE(AArch64ISD::MVNIshift) MAKE_CASE(AArch64ISD::MVNImsl) MAKE_CASE(AArch64ISD::BICi) MAKE_CASE(AArch64ISD::ORRi) MAKE_CASE(AArch64ISD::BSP) MAKE_CASE(AArch64ISD::EXTR) MAKE_CASE(AArch64ISD::ZIP1) 
MAKE_CASE(AArch64ISD::ZIP2) MAKE_CASE(AArch64ISD::UZP1) MAKE_CASE(AArch64ISD::UZP2) MAKE_CASE(AArch64ISD::TRN1) MAKE_CASE(AArch64ISD::TRN2) MAKE_CASE(AArch64ISD::REV16) MAKE_CASE(AArch64ISD::REV32) MAKE_CASE(AArch64ISD::REV64) MAKE_CASE(AArch64ISD::EXT) MAKE_CASE(AArch64ISD::SPLICE) MAKE_CASE(AArch64ISD::VSHL) MAKE_CASE(AArch64ISD::VLSHR) MAKE_CASE(AArch64ISD::VASHR) MAKE_CASE(AArch64ISD::VSLI) MAKE_CASE(AArch64ISD::VSRI) MAKE_CASE(AArch64ISD::CMEQ) MAKE_CASE(AArch64ISD::CMGE) MAKE_CASE(AArch64ISD::CMGT) MAKE_CASE(AArch64ISD::CMHI) MAKE_CASE(AArch64ISD::CMHS) MAKE_CASE(AArch64ISD::FCMEQ) MAKE_CASE(AArch64ISD::FCMGE) MAKE_CASE(AArch64ISD::FCMGT) MAKE_CASE(AArch64ISD::CMEQz) MAKE_CASE(AArch64ISD::CMGEz) MAKE_CASE(AArch64ISD::CMGTz) MAKE_CASE(AArch64ISD::CMLEz) MAKE_CASE(AArch64ISD::CMLTz) MAKE_CASE(AArch64ISD::FCMEQz) MAKE_CASE(AArch64ISD::FCMGEz) MAKE_CASE(AArch64ISD::FCMGTz) MAKE_CASE(AArch64ISD::FCMLEz) MAKE_CASE(AArch64ISD::FCMLTz) MAKE_CASE(AArch64ISD::SADDV) MAKE_CASE(AArch64ISD::UADDV) MAKE_CASE(AArch64ISD::SDOT) MAKE_CASE(AArch64ISD::UDOT) MAKE_CASE(AArch64ISD::SMINV) MAKE_CASE(AArch64ISD::UMINV) MAKE_CASE(AArch64ISD::SMAXV) MAKE_CASE(AArch64ISD::UMAXV) MAKE_CASE(AArch64ISD::SADDV_PRED) MAKE_CASE(AArch64ISD::UADDV_PRED) MAKE_CASE(AArch64ISD::SMAXV_PRED) MAKE_CASE(AArch64ISD::UMAXV_PRED) MAKE_CASE(AArch64ISD::SMINV_PRED) MAKE_CASE(AArch64ISD::UMINV_PRED) MAKE_CASE(AArch64ISD::ORV_PRED) MAKE_CASE(AArch64ISD::EORV_PRED) MAKE_CASE(AArch64ISD::ANDV_PRED) MAKE_CASE(AArch64ISD::CLASTA_N) MAKE_CASE(AArch64ISD::CLASTB_N) MAKE_CASE(AArch64ISD::LASTA) MAKE_CASE(AArch64ISD::LASTB) MAKE_CASE(AArch64ISD::REINTERPRET_CAST) MAKE_CASE(AArch64ISD::LS64_BUILD) MAKE_CASE(AArch64ISD::LS64_EXTRACT) MAKE_CASE(AArch64ISD::TBL) MAKE_CASE(AArch64ISD::FADD_PRED) MAKE_CASE(AArch64ISD::FADDA_PRED) MAKE_CASE(AArch64ISD::FADDV_PRED) MAKE_CASE(AArch64ISD::FDIV_PRED) MAKE_CASE(AArch64ISD::FMA_PRED) MAKE_CASE(AArch64ISD::FMAX_PRED) MAKE_CASE(AArch64ISD::FMAXV_PRED) MAKE_CASE(AArch64ISD::FMAXNM_PRED) MAKE_CASE(AArch64ISD::FMAXNMV_PRED) MAKE_CASE(AArch64ISD::FMIN_PRED) MAKE_CASE(AArch64ISD::FMINV_PRED) MAKE_CASE(AArch64ISD::FMINNM_PRED) MAKE_CASE(AArch64ISD::FMINNMV_PRED) MAKE_CASE(AArch64ISD::FMUL_PRED) MAKE_CASE(AArch64ISD::FSUB_PRED) MAKE_CASE(AArch64ISD::RDSVL) MAKE_CASE(AArch64ISD::BIC) MAKE_CASE(AArch64ISD::BIT) MAKE_CASE(AArch64ISD::CBZ) MAKE_CASE(AArch64ISD::CBNZ) MAKE_CASE(AArch64ISD::TBZ) MAKE_CASE(AArch64ISD::TBNZ) MAKE_CASE(AArch64ISD::TC_RETURN) MAKE_CASE(AArch64ISD::PREFETCH) MAKE_CASE(AArch64ISD::SITOF) MAKE_CASE(AArch64ISD::UITOF) MAKE_CASE(AArch64ISD::NVCAST) MAKE_CASE(AArch64ISD::MRS) MAKE_CASE(AArch64ISD::SQSHL_I) MAKE_CASE(AArch64ISD::UQSHL_I) MAKE_CASE(AArch64ISD::SRSHR_I) MAKE_CASE(AArch64ISD::URSHR_I) MAKE_CASE(AArch64ISD::SQSHLU_I) MAKE_CASE(AArch64ISD::WrapperLarge) MAKE_CASE(AArch64ISD::LD2post) MAKE_CASE(AArch64ISD::LD3post) MAKE_CASE(AArch64ISD::LD4post) MAKE_CASE(AArch64ISD::ST2post) MAKE_CASE(AArch64ISD::ST3post) MAKE_CASE(AArch64ISD::ST4post) MAKE_CASE(AArch64ISD::LD1x2post) MAKE_CASE(AArch64ISD::LD1x3post) MAKE_CASE(AArch64ISD::LD1x4post) MAKE_CASE(AArch64ISD::ST1x2post) MAKE_CASE(AArch64ISD::ST1x3post) MAKE_CASE(AArch64ISD::ST1x4post) MAKE_CASE(AArch64ISD::LD1DUPpost) MAKE_CASE(AArch64ISD::LD2DUPpost) MAKE_CASE(AArch64ISD::LD3DUPpost) MAKE_CASE(AArch64ISD::LD4DUPpost) MAKE_CASE(AArch64ISD::LD1LANEpost) MAKE_CASE(AArch64ISD::LD2LANEpost) MAKE_CASE(AArch64ISD::LD3LANEpost) MAKE_CASE(AArch64ISD::LD4LANEpost) MAKE_CASE(AArch64ISD::ST2LANEpost) MAKE_CASE(AArch64ISD::ST3LANEpost) 
MAKE_CASE(AArch64ISD::ST4LANEpost) MAKE_CASE(AArch64ISD::SMULL) MAKE_CASE(AArch64ISD::UMULL) MAKE_CASE(AArch64ISD::PMULL) MAKE_CASE(AArch64ISD::FRECPE) MAKE_CASE(AArch64ISD::FRECPS) MAKE_CASE(AArch64ISD::FRSQRTE) MAKE_CASE(AArch64ISD::FRSQRTS) MAKE_CASE(AArch64ISD::STG) MAKE_CASE(AArch64ISD::STZG) MAKE_CASE(AArch64ISD::ST2G) MAKE_CASE(AArch64ISD::STZ2G) MAKE_CASE(AArch64ISD::SUNPKHI) MAKE_CASE(AArch64ISD::SUNPKLO) MAKE_CASE(AArch64ISD::UUNPKHI) MAKE_CASE(AArch64ISD::UUNPKLO) MAKE_CASE(AArch64ISD::INSR) MAKE_CASE(AArch64ISD::PTEST) MAKE_CASE(AArch64ISD::PTEST_ANY) MAKE_CASE(AArch64ISD::PTRUE) MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO) MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO) MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO) MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO) MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO) MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO) MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO) MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO) MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO) MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO) MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO) MAKE_CASE(AArch64ISD::ST1_PRED) MAKE_CASE(AArch64ISD::SST1_PRED) MAKE_CASE(AArch64ISD::SST1_SCALED_PRED) MAKE_CASE(AArch64ISD::SST1_SXTW_PRED) MAKE_CASE(AArch64ISD::SST1_UXTW_PRED) MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED) MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED) MAKE_CASE(AArch64ISD::SST1_IMM_PRED) MAKE_CASE(AArch64ISD::SSTNT1_PRED) MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED) MAKE_CASE(AArch64ISD::LDP) MAKE_CASE(AArch64ISD::LDIAPP) MAKE_CASE(AArch64ISD::LDNP) MAKE_CASE(AArch64ISD::STP) MAKE_CASE(AArch64ISD::STILP) MAKE_CASE(AArch64ISD::STNP) MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::INDEX_VECTOR) MAKE_CASE(AArch64ISD::ADDP) MAKE_CASE(AArch64ISD::SADDLP) MAKE_CASE(AArch64ISD::UADDLP) MAKE_CASE(AArch64ISD::CALL_RVMARKER) 
MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL) MAKE_CASE(AArch64ISD::MOPS_MEMSET) MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING) MAKE_CASE(AArch64ISD::MOPS_MEMCOPY) MAKE_CASE(AArch64ISD::MOPS_MEMMOVE) MAKE_CASE(AArch64ISD::CALL_BTI) MAKE_CASE(AArch64ISD::MRRS) MAKE_CASE(AArch64ISD::MSRR) } #undef MAKE_CASE return nullptr; } MachineBasicBlock * AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *MBB) const { // We materialise the F128CSEL pseudo-instruction as some control flow and a // phi node: // OrigBB: // [... previous instrs leading to comparison ...] // b.ne TrueBB // b EndBB // TrueBB: // ; Fallthrough // EndBB: // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); DebugLoc DL = MI.getDebugLoc(); MachineFunction::iterator It = ++MBB->getIterator(); Register DestReg = MI.getOperand(0).getReg(); Register IfTrueReg = MI.getOperand(1).getReg(); Register IfFalseReg = MI.getOperand(2).getReg(); unsigned CondCode = MI.getOperand(3).getImm(); bool NZCVKilled = MI.getOperand(4).isKill(); MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(It, TrueBB); MF->insert(It, EndBB); // Transfer rest of current basic-block to EndBB EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); EndBB->transferSuccessorsAndUpdatePHIs(MBB); BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); MBB->addSuccessor(TrueBB); MBB->addSuccessor(EndBB); // TrueBB falls through to the end. TrueBB->addSuccessor(EndBB); if (!NZCVKilled) { TrueBB->addLiveIn(AArch64::NZCV); EndBB->addLiveIn(AArch64::NZCV); } BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) .addReg(IfTrueReg) .addMBB(TrueBB) .addReg(IfFalseReg) .addMBB(MBB); MI.eraseFromParent(); return EndBB; } MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( MachineInstr &MI, MachineBasicBlock *BB) const { assert(!isAsynchronousEHPersonality(classifyEHPersonality( BB->getParent()->getFunction().getPersonalityFn())) && "SEH does not use catchret!"); return BB; } MachineBasicBlock * AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); MIB.add(MI.getOperand(1)); // slice index register MIB.add(MI.getOperand(2)); // slice index offset MIB.add(MI.getOperand(3)); // pg MIB.add(MI.getOperand(4)); // base MIB.add(MI.getOperand(5)); // offset MI.eraseFromParent(); // The pseudo is gone now. return BB; } MachineBasicBlock * AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA)); MIB.addReg(AArch64::ZA, RegState::Define); MIB.add(MI.getOperand(0)); // Vector select register MIB.add(MI.getOperand(1)); // Vector select offset MIB.add(MI.getOperand(2)); // Base MIB.add(MI.getOperand(1)); // Offset, same as vector select offset MI.eraseFromParent(); // The pseudo is gone now. 
return BB; } MachineBasicBlock * AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB, bool HasTile) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); unsigned StartIdx = 0; if (HasTile) { MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); MIB.addReg(BaseReg + MI.getOperand(0).getImm()); StartIdx = 1; } else MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg); for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I) MIB.add(MI.getOperand(I)); MI.eraseFromParent(); // The pseudo is gone now. return BB; } MachineBasicBlock * AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M)); MIB.add(MI.getOperand(0)); // Mask unsigned Mask = MI.getOperand(0).getImm(); for (unsigned I = 0; I < 8; I++) { if (Mask & (1 << I)) MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine); } MI.eraseFromParent(); // The pseudo is gone now. return BB; } MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode()); if (SMEOrigInstr != -1) { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); uint64_t SMEMatrixType = TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask; switch (SMEMatrixType) { case (AArch64::SMEMatrixArray): return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false); case (AArch64::SMEMatrixTileB): return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true); case (AArch64::SMEMatrixTileH): return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true); case (AArch64::SMEMatrixTileS): return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true); case (AArch64::SMEMatrixTileD): return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true); case (AArch64::SMEMatrixTileQ): return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true); } } switch (MI.getOpcode()) { default: #ifndef NDEBUG MI.dump(); #endif llvm_unreachable("Unexpected instruction for custom inserter!"); case AArch64::F128CSEL: return EmitF128CSEL(MI, BB); case TargetOpcode::STATEPOINT: // STATEPOINT is a pseudo instruction which has no implicit defs/uses // while bl call instruction (where statepoint will be lowered at the end) // has implicit def. This def is early-clobber as it will be set at // the moment of the call and earlier than any use is read. // Add this implicit dead def here as a workaround. 
MI.addOperand(*MI.getMF(), MachineOperand::CreateReg( AArch64::LR, /*isDef*/ true, /*isImp*/ true, /*isKill*/ false, /*isDead*/ true, /*isUndef*/ false, /*isEarlyClobber*/ true)); [[fallthrough]]; case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); case TargetOpcode::PATCHABLE_EVENT_CALL: case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL: return BB; case AArch64::CATCHRET: return EmitLoweredCatchRet(MI, BB); case AArch64::LD1_MXIPXX_H_PSEUDO_B: return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB); case AArch64::LD1_MXIPXX_H_PSEUDO_H: return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB); case AArch64::LD1_MXIPXX_H_PSEUDO_S: return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB); case AArch64::LD1_MXIPXX_H_PSEUDO_D: return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB); case AArch64::LD1_MXIPXX_H_PSEUDO_Q: return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB); case AArch64::LD1_MXIPXX_V_PSEUDO_B: return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB); case AArch64::LD1_MXIPXX_V_PSEUDO_H: return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB); case AArch64::LD1_MXIPXX_V_PSEUDO_S: return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB); case AArch64::LD1_MXIPXX_V_PSEUDO_D: return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB); case AArch64::LD1_MXIPXX_V_PSEUDO_Q: return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB); case AArch64::LDR_ZA_PSEUDO: return EmitFill(MI, BB); case AArch64::ZERO_M_PSEUDO: return EmitZero(MI, BB); } } //===----------------------------------------------------------------------===// // AArch64 Lowering private implementation. //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // Lowering Code //===----------------------------------------------------------------------===// // Forward declarations of SVE fixed length lowering helpers static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT); static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG); static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT); /// isZerosVector - Check whether SDNode N is a zero-filled vector. static bool isZerosVector(const SDNode *N) { // Look through a bit convert. 
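  // (Editorial note: the forms accepted below are an all-zero constant splat
  // or BUILD_VECTOR, or an AArch64ISD::DUP of a zero integer/FP constant,
  // possibly wrapped in one or more bitcasts.)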
while (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0).getNode(); if (ISD::isConstantSplatVectorAllZeros(N)) return true; if (N->getOpcode() != AArch64ISD::DUP) return false; auto Opnd0 = N->getOperand(0); return isNullConstant(Opnd0) || isNullFPConstant(Opnd0); } /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 /// CC static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { switch (CC) { default: llvm_unreachable("Unknown condition code!"); case ISD::SETNE: return AArch64CC::NE; case ISD::SETEQ: return AArch64CC::EQ; case ISD::SETGT: return AArch64CC::GT; case ISD::SETGE: return AArch64CC::GE; case ISD::SETLT: return AArch64CC::LT; case ISD::SETLE: return AArch64CC::LE; case ISD::SETUGT: return AArch64CC::HI; case ISD::SETUGE: return AArch64CC::HS; case ISD::SETULT: return AArch64CC::LO; case ISD::SETULE: return AArch64CC::LS; } } /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2) { CondCode2 = AArch64CC::AL; switch (CC) { default: llvm_unreachable("Unknown FP condition!"); case ISD::SETEQ: case ISD::SETOEQ: CondCode = AArch64CC::EQ; break; case ISD::SETGT: case ISD::SETOGT: CondCode = AArch64CC::GT; break; case ISD::SETGE: case ISD::SETOGE: CondCode = AArch64CC::GE; break; case ISD::SETOLT: CondCode = AArch64CC::MI; break; case ISD::SETOLE: CondCode = AArch64CC::LS; break; case ISD::SETONE: CondCode = AArch64CC::MI; CondCode2 = AArch64CC::GT; break; case ISD::SETO: CondCode = AArch64CC::VC; break; case ISD::SETUO: CondCode = AArch64CC::VS; break; case ISD::SETUEQ: CondCode = AArch64CC::EQ; CondCode2 = AArch64CC::VS; break; case ISD::SETUGT: CondCode = AArch64CC::HI; break; case ISD::SETUGE: CondCode = AArch64CC::PL; break; case ISD::SETLT: case ISD::SETULT: CondCode = AArch64CC::LT; break; case ISD::SETLE: case ISD::SETULE: CondCode = AArch64CC::LE; break; case ISD::SETNE: case ISD::SETUNE: CondCode = AArch64CC::NE; break; } } /// Convert a DAG fp condition code to an AArch64 CC. /// This differs from changeFPCCToAArch64CC in that it returns cond codes that /// should be AND'ed instead of OR'ed. static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2) { CondCode2 = AArch64CC::AL; switch (CC) { default: changeFPCCToAArch64CC(CC, CondCode, CondCode2); assert(CondCode2 == AArch64CC::AL); break; case ISD::SETONE: // (a one b) // == ((a olt b) || (a ogt b)) // == ((a ord b) && (a une b)) CondCode = AArch64CC::VC; CondCode2 = AArch64CC::NE; break; case ISD::SETUEQ: // (a ueq b) // == ((a uno b) || (a oeq b)) // == ((a ule b) && (a uge b)) CondCode = AArch64CC::PL; CondCode2 = AArch64CC::LE; break; } } /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 /// CC usable with the vector instructions. Fewer operations are available /// without a real NZCV register, so we have to use less efficient combinations /// to get the same effect. static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert) { Invert = false; switch (CC) { default: // Mostly the scalar mappings work fine. 
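    // (Editorial example: SETOGT maps directly to AArch64CC::GT, whereas
    // SETONE comes back from changeFPCCToAArch64CC as the pair {MI, GT},
    // meaning the result is true if either flag test passes.)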
changeFPCCToAArch64CC(CC, CondCode, CondCode2); break; case ISD::SETUO: Invert = true; [[fallthrough]]; case ISD::SETO: CondCode = AArch64CC::MI; CondCode2 = AArch64CC::GE; break; case ISD::SETUEQ: case ISD::SETULT: case ISD::SETULE: case ISD::SETUGT: case ISD::SETUGE: // All of the compare-mask comparisons are ordered, but we can switch // between the two by a double inversion. E.g. ULE == !OGT. Invert = true; changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32), CondCode, CondCode2); break; } } static bool isLegalArithImmed(uint64_t C) { // Matches AArch64DAGToDAGISel::SelectArithImmed(). bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); LLVM_DEBUG(dbgs() << "Is imm " << C << " legal: " << (IsLegal ? "yes\n" : "no\n")); return IsLegal; } // Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags // can be set differently by this operation. It comes down to whether // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then // everything is fine. If not then the optimization is wrong. Thus general // comparisons are only valid if op2 != 0. // // So, finally, the only LLVM-native comparisons that don't mention C and V // are SETEQ and SETNE. They're the only ones we can safely use CMN for in // the absence of information about op2. static bool isCMN(SDValue Op, ISD::CondCode CC) { return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) && (CC == ISD::SETEQ || CC == ISD::SETNE); } static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, bool IsSignaling) { EVT VT = LHS.getValueType(); assert(VT != MVT::f128); const bool FullFP16 = DAG.getSubtarget().hasFullFP16(); if (VT == MVT::f16 && !FullFP16) { LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other}, {Chain, LHS}); RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other}, {LHS.getValue(1), RHS}); Chain = RHS.getValue(1); VT = MVT::f32; } unsigned Opcode = IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP; return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS}); } static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) { EVT VT = LHS.getValueType(); const bool FullFP16 = DAG.getSubtarget().hasFullFP16(); if (VT.isFloatingPoint()) { assert(VT != MVT::f128); if (VT == MVT::f16 && !FullFP16) { LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); VT = MVT::f32; } return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); } // The CMP instruction is just an alias for SUBS, and representing it as // SUBS means that it's possible to get CSE with subtract operations. // A later phase can perform the optimization of setting the destination // register to WZR/XZR if it ends up being unused. unsigned Opcode = AArch64ISD::SUBS; if (isCMN(RHS, CC)) { // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ? Opcode = AArch64ISD::ADDS; RHS = RHS.getOperand(1); } else if (isCMN(LHS, CC)) { // As we are looking for EQ/NE compares, the operands can be commuted ; can // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ? Opcode = AArch64ISD::ADDS; LHS = LHS.getOperand(1); } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) { if (LHS.getOpcode() == ISD::AND) { // Similarly, (CMP (and X, Y), 0) can be implemented with a TST // (a.k.a. 
ANDS) except that the flags are only guaranteed to work for one // of the signed comparisons. const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl, DAG.getVTList(VT, MVT_CC), LHS.getOperand(0), LHS.getOperand(1)); // Replace all users of (and X, Y) with newly generated (ands X, Y) DAG.ReplaceAllUsesWith(LHS, ANDSNode); return ANDSNode.getValue(1); } else if (LHS.getOpcode() == AArch64ISD::ANDS) { // Use result of ANDS return LHS.getValue(1); } } return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) .getValue(1); } /// \defgroup AArch64CCMP CMP;CCMP matching /// /// These functions deal with the formation of CMP;CCMP;... sequences. /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of /// a comparison. They set the NZCV flags to a predefined value if their /// predicate is false. This allows to express arbitrary conjunctions, for /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))" /// expressed as: /// cmp A /// ccmp B, inv(CB), CA /// check for CB flags /// /// This naturally lets us implement chains of AND operations with SETCC /// operands. And we can even implement some other situations by transforming /// them: /// - We can implement (NEG SETCC) i.e. negating a single comparison by /// negating the flags used in a CCMP/FCCMP operations. /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations /// by negating the flags we test for afterwards. i.e. /// NEG (CMP CCMP CCCMP ...) can be implemented. /// - Note that we can only ever negate all previously processed results. /// What we can not implement by flipping the flags to test is a negation /// of two sub-trees (because the negation affects all sub-trees emitted so /// far, so the 2nd sub-tree we emit would also affect the first). /// With those tools we can implement some OR operations: /// - (OR (SETCC A) (SETCC B)) can be implemented via: /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B))) /// - After transforming OR to NEG/AND combinations we may be able to use NEG /// elimination rules from earlier to implement the whole thing as a /// CCMP/FCCMP chain. /// /// As complete example: /// or (or (setCA (cmp A)) (setCB (cmp B))) /// (and (setCC (cmp C)) (setCD (cmp D)))" /// can be reassociated to: /// or (and (setCC (cmp C)) setCD (cmp D)) // (or (setCA (cmp A)) (setCB (cmp B))) /// can be transformed to: /// not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))" /// which can be implemented as: /// cmp C /// ccmp D, inv(CD), CC /// ccmp A, CA, inv(CD) /// ccmp B, CB, inv(CA) /// check for CB flags /// /// A counterexample is "or (and A B) (and C D)" which translates to /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we /// can only implement 1 of the inner (not) operations, but not both! /// @{ /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. 
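///
/// (Editorial sketch, for illustration only; the register assignments are
/// hypothetical.) A condition such as "a == 0 && b > 5 && c != d" would
/// roughly become:
///
///     cmp  w0, #0
///     ccmp w1, #5, #4, eq   // only compare b when a == 0, else force Z=1
///     ccmp w2, w3, #4, gt   // only compare c,d when b > 5, else force Z=1
///     cset w8, ne
///
/// where each #4 is the NZCV value that makes the next condition fail when
/// the ccmp's predicate does not hold.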
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG) { unsigned Opcode = 0; const bool FullFP16 = DAG.getSubtarget().hasFullFP16(); if (LHS.getValueType().isFloatingPoint()) { assert(LHS.getValueType() != MVT::f128); if (LHS.getValueType() == MVT::f16 && !FullFP16) { LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); } Opcode = AArch64ISD::FCCMP; } else if (RHS.getOpcode() == ISD::SUB) { SDValue SubOp0 = RHS.getOperand(0); if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // See emitComparison() on why we can only do this for SETEQ and SETNE. Opcode = AArch64ISD::CCMN; RHS = RHS.getOperand(1); } } if (Opcode == 0) Opcode = AArch64ISD::CCMP; SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC); AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); } /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be /// expressed as a conjunction. See \ref AArch64CCMP. /// \param CanNegate Set to true if we can negate the whole sub-tree just by /// changing the conditions on the SETCC tests. /// (this means we can call emitConjunctionRec() with /// Negate==true on this sub-tree) /// \param MustBeFirst Set to true if this subtree needs to be negated and we /// cannot do the negation naturally. We are required to /// emit the subtree first in this case. /// \param WillNegate Is true if are called when the result of this /// subexpression must be negated. This happens when the /// outer expression is an OR. We can use this fact to know /// that we have a double negation (or (or ...) ...) that /// can be implemented for free. static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth = 0) { if (!Val.hasOneUse()) return false; unsigned Opcode = Val->getOpcode(); if (Opcode == ISD::SETCC) { if (Val->getOperand(0).getValueType() == MVT::f128) return false; CanNegate = true; MustBeFirst = false; return true; } // Protect against exponential runtime and stack overflow. if (Depth > 6) return false; if (Opcode == ISD::AND || Opcode == ISD::OR) { bool IsOR = Opcode == ISD::OR; SDValue O0 = Val->getOperand(0); SDValue O1 = Val->getOperand(1); bool CanNegateL; bool MustBeFirstL; if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1)) return false; bool CanNegateR; bool MustBeFirstR; if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1)) return false; if (MustBeFirstL && MustBeFirstR) return false; if (IsOR) { // For an OR expression we need to be able to naturally negate at least // one side or we cannot do the transformation at all. if (!CanNegateL && !CanNegateR) return false; // If we the result of the OR will be negated and we can naturally negate // the leafs, then this sub-tree as a whole negates naturally. CanNegate = WillNegate && CanNegateL && CanNegateR; // If we cannot naturally negate the whole sub-tree, then this must be // emitted first. MustBeFirst = !CanNegate; } else { assert(Opcode == ISD::AND && "Must be OR or AND"); // We cannot naturally negate an AND operation. 
CanNegate = false; MustBeFirst = MustBeFirstL || MustBeFirstR; } return true; } return false; } /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain /// of CCMP/CFCMP ops. See @ref AArch64CCMP. /// Tries to transform the given i1 producing node @p Val to a series compare /// and conditional compare operations. @returns an NZCV flags producing node /// and sets @p OutCC to the flags that should be tested or returns SDValue() if /// transformation was not possible. /// \p Negate is true if we want this sub-tree being negated just by changing /// SETCC conditions. static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate) { // We're at a tree leaf, produce a conditional comparison operation. unsigned Opcode = Val->getOpcode(); if (Opcode == ISD::SETCC) { SDValue LHS = Val->getOperand(0); SDValue RHS = Val->getOperand(1); ISD::CondCode CC = cast(Val->getOperand(2))->get(); bool isInteger = LHS.getValueType().isInteger(); if (Negate) CC = getSetCCInverse(CC, LHS.getValueType()); SDLoc DL(Val); // Determine OutCC and handle FP special case. if (isInteger) { OutCC = changeIntCCToAArch64CC(CC); } else { assert(LHS.getValueType().isFloatingPoint()); AArch64CC::CondCode ExtraCC; changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); // Some floating point conditions can't be tested with a single condition // code. Construct an additional comparison in this case. if (ExtraCC != AArch64CC::AL) { SDValue ExtraCmp; if (!CCOp.getNode()) ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); else ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, ExtraCC, DL, DAG); CCOp = ExtraCmp; Predicate = ExtraCC; } } // Produce a normal comparison if we are first in the chain if (!CCOp) return emitComparison(LHS, RHS, CC, DL, DAG); // Otherwise produce a ccmp. return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, DAG); } assert(Val->hasOneUse() && "Valid conjunction/disjunction tree"); bool IsOR = Opcode == ISD::OR; SDValue LHS = Val->getOperand(0); bool CanNegateL; bool MustBeFirstL; bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR); assert(ValidL && "Valid conjunction/disjunction tree"); (void)ValidL; SDValue RHS = Val->getOperand(1); bool CanNegateR; bool MustBeFirstR; bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR); assert(ValidR && "Valid conjunction/disjunction tree"); (void)ValidR; // Swap sub-tree that must come first to the right side. if (MustBeFirstL) { assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); std::swap(LHS, RHS); std::swap(CanNegateL, CanNegateR); std::swap(MustBeFirstL, MustBeFirstR); } bool NegateR; bool NegateAfterR; bool NegateL; bool NegateAfterAll; if (Opcode == ISD::OR) { // Swap the sub-tree that we can negate naturally to the left. if (!CanNegateL) { assert(CanNegateR && "at least one side must be negatable"); assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); assert(!Negate); std::swap(LHS, RHS); NegateR = false; NegateAfterR = true; } else { // Negate the left sub-tree if possible, otherwise negate the result. NegateR = CanNegateR; NegateAfterR = !CanNegateR; } NegateL = true; NegateAfterAll = !Negate; } else { assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree"); assert(!Negate && "Valid conjunction/disjunction tree"); NegateL = false; NegateR = false; NegateAfterR = false; NegateAfterAll = false; } // Emit sub-trees. 
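  // (Editorial note: in the OR case both sub-trees end up negated, either by
  // flipping their SETCC conditions or by inverting the emitted condition
  // code afterwards, and NegateAfterAll then inverts the final test,
  // realising or(A, B) == !(!A && !B) as described in the AArch64CCMP
  // comment above.)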
AArch64CC::CondCode RHSCC; SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate); if (NegateAfterR) RHSCC = AArch64CC::getInvertedCondCode(RHSCC); SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC); if (NegateAfterAll) OutCC = AArch64CC::getInvertedCondCode(OutCC); return CmpL; } /// Emit expression as a conjunction (a series of CCMP/CFCMP ops). /// In some cases this is even possible with OR operations in the expression. /// See \ref AArch64CCMP. /// \see emitConjunctionRec(). static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC) { bool DummyCanNegate; bool DummyMustBeFirst; if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false)) return SDValue(); return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL); } /// @} /// Returns how profitable it is to fold a comparison's operand's shift and/or /// extension operations. static unsigned getCmpOperandFoldingProfit(SDValue Op) { auto isSupportedExtend = [&](SDValue V) { if (V.getOpcode() == ISD::SIGN_EXTEND_INREG) return true; if (V.getOpcode() == ISD::AND) if (ConstantSDNode *MaskCst = dyn_cast(V.getOperand(1))) { uint64_t Mask = MaskCst->getZExtValue(); return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF); } return false; }; if (!Op.hasOneUse()) return 0; if (isSupportedExtend(Op)) return 1; unsigned Opc = Op.getOpcode(); if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) if (ConstantSDNode *ShiftCst = dyn_cast(Op.getOperand(1))) { uint64_t Shift = ShiftCst->getZExtValue(); if (isSupportedExtend(Op.getOperand(0))) return (Shift <= 4) ? 2 : 1; EVT VT = Op.getValueType(); if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63)) return 1; } return 0; } static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl) { if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { EVT VT = RHS.getValueType(); uint64_t C = RHSC->getZExtValue(); if (!isLegalArithImmed(C)) { // Constant does not fit, try adjusting it by one? switch (CC) { default: break; case ISD::SETLT: case ISD::SETGE: if ((VT == MVT::i32 && C != 0x80000000 && isLegalArithImmed((uint32_t)(C - 1))) || (VT == MVT::i64 && C != 0x80000000ULL && isLegalArithImmed(C - 1ULL))) { CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; RHS = DAG.getConstant(C, dl, VT); } break; case ISD::SETULT: case ISD::SETUGE: if ((VT == MVT::i32 && C != 0 && isLegalArithImmed((uint32_t)(C - 1))) || (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; RHS = DAG.getConstant(C, dl, VT); } break; case ISD::SETLE: case ISD::SETGT: if ((VT == MVT::i32 && C != INT32_MAX && isLegalArithImmed((uint32_t)(C + 1))) || (VT == MVT::i64 && C != INT64_MAX && isLegalArithImmed(C + 1ULL))) { CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; RHS = DAG.getConstant(C, dl, VT); } break; case ISD::SETULE: case ISD::SETUGT: if ((VT == MVT::i32 && C != UINT32_MAX && isLegalArithImmed((uint32_t)(C + 1))) || (VT == MVT::i64 && C != UINT64_MAX && isLegalArithImmed(C + 1ULL))) { CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; C = (VT == MVT::i32) ? 
(uint32_t)(C + 1) : C + 1; RHS = DAG.getConstant(C, dl, VT); } break; } } } // Comparisons are canonicalized so that the RHS operand is simpler than the // LHS one, the extreme case being when RHS is an immediate. However, AArch64 // can fold some shift+extend operations on the RHS operand, so swap the // operands if that can be done. // // For example: // lsl w13, w11, #1 // cmp w13, w12 // can be turned into: // cmp w12, w11, lsl #1 if (!isa(RHS) || !isLegalArithImmed(cast(RHS)->getZExtValue())) { SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS; if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) { std::swap(LHS, RHS); CC = ISD::getSetCCSwappedOperands(CC); } } SDValue Cmp; AArch64CC::CondCode AArch64CC; if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa(RHS)) { const ConstantSDNode *RHSC = cast(RHS); // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. // For the i8 operand, the largest immediate is 255, so this can be easily // encoded in the compare instruction. For the i16 operand, however, the // largest immediate cannot be encoded in the compare. // Therefore, use a sign extending load and cmn to avoid materializing the // -1 constant. For example, // movz w1, #65535 // ldrh w0, [x0, #0] // cmp w0, w1 // > // ldrsh w0, [x0, #0] // cmn w0, #1 // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) // if and only if (sext LHS) == (sext RHS). The checks are in place to // ensure both the LHS and RHS are truly zero extended and to make sure the // transformation is profitable. if ((RHSC->getZExtValue() >> 16 == 0) && isa(LHS) && cast(LHS)->getExtensionType() == ISD::ZEXTLOAD && cast(LHS)->getMemoryVT() == MVT::i16 && LHS.getNode()->hasNUsesOfValue(1, 0)) { int16_t ValueofRHS = cast(RHS)->getZExtValue(); if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, DAG.getValueType(MVT::i16)); Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, RHS.getValueType()), CC, dl, DAG); AArch64CC = changeIntCCToAArch64CC(CC); } } if (!Cmp && (RHSC->isZero() || RHSC->isOne())) { if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) { if ((CC == ISD::SETNE) ^ RHSC->isZero()) AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); } } } if (!Cmp) { Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC = changeIntCCToAArch64CC(CC); } AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); return Cmp; } static std::pair getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && "Unsupported value type"); SDValue Value, Overflow; SDLoc DL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); unsigned Opc = 0; switch (Op.getOpcode()) { default: llvm_unreachable("Unknown overflow instruction!"); case ISD::SADDO: Opc = AArch64ISD::ADDS; CC = AArch64CC::VS; break; case ISD::UADDO: Opc = AArch64ISD::ADDS; CC = AArch64CC::HS; break; case ISD::SSUBO: Opc = AArch64ISD::SUBS; CC = AArch64CC::VS; break; case ISD::USUBO: Opc = AArch64ISD::SUBS; CC = AArch64CC::LO; break; // Multiply needs a little bit extra work. case ISD::SMULO: case ISD::UMULO: { CC = AArch64CC::NE; bool IsSigned = Op.getOpcode() == ISD::SMULO; if (Op.getValueType() == MVT::i32) { // Extend to 64-bits, then perform a 64-bit multiply. unsigned ExtendOpc = IsSigned ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul); // Check that the result fits into a 32-bit integer. SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC); if (IsSigned) { // cmp xreg, wreg, sxtw SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1); } else { // tst xreg, #0xffffffff00000000 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64); Overflow = DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1); } break; } assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); // For the 64 bit multiply Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); if (IsSigned) { SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, DAG.getConstant(63, DL, MVT::i64)); // It is important that LowerBits is last, otherwise the arithmetic // shift will not be folded into the compare (SUBS). SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) .getValue(1); } else { SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, DL, MVT::i64), UpperBits).getValue(1); } break; } } // switch (...) if (Opc) { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); // Emit the AArch64 operation with overflow check. Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); Overflow = Value.getValue(1); } return std::make_pair(Value, Overflow); } SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const { if (useSVEForFixedLengthVectorVT(Op.getValueType(), !Subtarget->isNeonAvailable())) return LowerToScalableOp(Op, DAG); SDValue Sel = Op.getOperand(0); SDValue Other = Op.getOperand(1); SDLoc dl(Sel); // If the operand is an overflow checking operation, invert the condition // code and kill the Not operation. I.e., transform: // (xor (overflow_op_bool, 1)) // --> // (csel 1, 0, invert(cc), overflow_op_bool) // ... which later gets transformed to just a cset instruction with an // inverted condition code, rather than a cset + eor sequence. if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0))) return SDValue(); SDValue TVal = DAG.getConstant(1, dl, MVT::i32); SDValue FVal = DAG.getConstant(0, dl, MVT::i32); AArch64CC::CondCode CC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG); SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal, CCVal, Overflow); } // If neither operand is a SELECT_CC, give up. if (Sel.getOpcode() != ISD::SELECT_CC) std::swap(Sel, Other); if (Sel.getOpcode() != ISD::SELECT_CC) return Op; // The folding we want to perform is: // (xor x, (select_cc a, b, cc, 0, -1) ) // --> // (csel x, (xor x, -1), cc ...) // // The latter will get matched to a CSINV instruction. 
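// Editor's sketch (plain C++): the scalar identity behind the fold described
// in the comment above. XOR with 0 is the identity and XOR with -1 is bitwise
// NOT, so "x ^ (cc ? 0 : -1)" is just a select between x and ~x -- which is
// what a CSEL of x and (x ^ -1), i.e. CSINV, computes.
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (uint64_t x : {0ull, 1ull, 0x1234abcdull, ~0ull})
    for (bool cc : {false, true})
      assert((x ^ (cc ? 0ull : ~0ull)) == (cc ? x : ~x));
  return 0;
}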
ISD::CondCode CC = cast(Sel.getOperand(4))->get(); SDValue LHS = Sel.getOperand(0); SDValue RHS = Sel.getOperand(1); SDValue TVal = Sel.getOperand(2); SDValue FVal = Sel.getOperand(3); // FIXME: This could be generalized to non-integer comparisons. if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) return Op; ConstantSDNode *CFVal = dyn_cast(FVal); ConstantSDNode *CTVal = dyn_cast(TVal); // The values aren't constants, this isn't the pattern we're looking for. if (!CFVal || !CTVal) return Op; // We can commute the SELECT_CC by inverting the condition. This // might be needed to make this fit into a CSINV pattern. if (CTVal->isAllOnes() && CFVal->isZero()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } // If the constants line up, perform the transform! if (CTVal->isZero() && CFVal->isAllOnes()) { SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); FVal = Other; TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, DAG.getConstant(-1ULL, dl, Other.getValueType())); return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, CCVal, Cmp); } return Op; } // If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C' // bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else // sets 'C' bit to 0. static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) { SDLoc DL(Value); EVT VT = Value.getValueType(); SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value; SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT); SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1); return Cmp.getValue(1); } // If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0. // If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1. static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert) { assert(Glue.getResNo() == 1); SDLoc DL(Glue); SDValue Zero = DAG.getConstant(0, DL, VT); SDValue One = DAG.getConstant(1, DL, VT); unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS; SDValue CC = DAG.getConstant(Cond, DL, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue); } // Value is 1 if 'V' bit of NZCV is 1, else 0 static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) { assert(Glue.getResNo() == 1); SDLoc DL(Glue); SDValue Zero = DAG.getConstant(0, DL, VT); SDValue One = DAG.getConstant(1, DL, VT); SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue); } // This lowering is inefficient, but it will get cleaned up by // `foldOverflowCheck` static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned) { EVT VT0 = Op.getValue(0).getValueType(); EVT VT1 = Op.getValue(1).getValueType(); if (VT0 != MVT::i32 && VT0 != MVT::i64) return SDValue(); bool InvertCarry = Opcode == AArch64ISD::SBCS; SDValue OpLHS = Op.getOperand(0); SDValue OpRHS = Op.getOperand(1); SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry); SDLoc DL(Op); SDVTList VTs = DAG.getVTList(VT0, VT1); SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS, OpRHS, OpCarryIn); SDValue OutFlag = IsSigned ? 
overflowFlagToValue(Sum.getValue(1), VT1, DAG) : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry); return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag); } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) return SDValue(); SDLoc dl(Op); AArch64CC::CondCode CC; // The actual operation that sets the overflow or carry flag. SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); // We use 0 and 1 as false and true values. SDValue TVal = DAG.getConstant(1, dl, MVT::i32); SDValue FVal = DAG.getConstant(0, dl, MVT::i32); // We use an inverted condition, because the conditional select is inverted // too. This will allow it to be selected to a single instruction: // CSINC Wd, WZR, WZR, invert(cond). SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, CCVal, Overflow); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); } // Prefetch operands are: // 1: Address to prefetch // 2: bool isWrite // 3: int locality (0 = no locality ... 3 = extreme locality) // 4: bool isDataCache static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); unsigned IsWrite = cast(Op.getOperand(2))->getZExtValue(); unsigned Locality = cast(Op.getOperand(3))->getZExtValue(); unsigned IsData = cast(Op.getOperand(4))->getZExtValue(); bool IsStream = !Locality; // When the locality number is set if (Locality) { // The front-end should have filtered out the out-of-range values assert(Locality <= 3 && "Prefetch locality out-of-range"); // The locality degree is the opposite of the cache speed. // Put the number the other way around. // The encoding starts at 0 for level 1 Locality = 3 - Locality; } // built the mask value encoding the expected behavior. unsigned PrfOp = (IsWrite << 4) | // Load/Store bit (!IsData << 3) | // IsDataCache bit (Locality << 1) | // Cache level bits (unsigned)IsStream; // Stream bit return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), DAG.getTargetConstant(PrfOp, DL, MVT::i32), Op.getOperand(1)); } SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isScalableVector()) return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU); if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) return LowerFixedLengthFPExtendToSVE(Op, DAG); assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); return SDValue(); } SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isScalableVector()) return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU); bool IsStrict = Op->isStrictFPOpcode(); SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); EVT SrcVT = SrcVal.getValueType(); if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable())) return LowerFixedLengthFPRoundToSVE(Op, DAG); if (SrcVT != MVT::f128) { // Expand cases where the input is a vector bigger than NEON. if (useSVEForFixedLengthVectorVT(SrcVT)) return SDValue(); // It's legal except when f128 is involved return Op; } return SDValue(); } SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 
// Any additional optimization in this function should be recorded // in the cost tables. bool IsStrict = Op->isStrictFPOpcode(); EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType(); EVT VT = Op.getValueType(); if (VT.isScalableVector()) { unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT ? AArch64ISD::FCVTZU_MERGE_PASSTHRU : AArch64ISD::FCVTZS_MERGE_PASSTHRU; return LowerToPredicatedOp(Op, DAG, Opcode); } if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) || useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) return LowerFixedLengthFPToIntToSVE(Op, DAG); unsigned NumElts = InVT.getVectorNumElements(); // f16 conversions are promoted to f32 when full fp16 is not supported. if (InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) { MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); SDLoc dl(Op); if (IsStrict) { SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other}, {Op.getOperand(0), Op.getOperand(1)}); return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, {Ext.getValue(1), Ext.getValue(0)}); } return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); } uint64_t VTSize = VT.getFixedSizeInBits(); uint64_t InVTSize = InVT.getFixedSizeInBits(); if (VTSize < InVTSize) { SDLoc dl(Op); if (IsStrict) { InVT = InVT.changeVectorElementTypeToInteger(); SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other}, {Op.getOperand(0), Op.getOperand(1)}); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl); } SDValue Cv = DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), Op.getOperand(0)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); } if (VTSize > InVTSize) { SDLoc dl(Op); MVT ExtVT = MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), VT.getVectorNumElements()); if (IsStrict) { SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other}, {Op.getOperand(0), Op.getOperand(1)}); return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, {Ext.getValue(1), Ext.getValue(0)}); } SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); return DAG.getNode(Op.getOpcode(), dl, VT, Ext); } // Use a scalar operation for conversions between single-element vectors of // the same size. if (NumElts == 1) { SDLoc dl(Op); SDValue Extract = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(), Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64)); EVT ScalarVT = VT.getScalarType(); if (IsStrict) return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other}, {Op.getOperand(0), Extract}); return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract); } // Type changing conversions are illegal. return Op; } SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); if (SrcVal.getValueType().isVector()) return LowerVectorFP_TO_INT(Op, DAG); // f16 conversions are promoted to f32 when full fp16 is not supported. 
if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { SDLoc dl(Op); if (IsStrict) { SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other}, {Op.getOperand(0), SrcVal}); return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other}, {Ext.getValue(1), Ext.getValue(0)}); } return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal)); } if (SrcVal.getValueType() != MVT::f128) { // It's legal except when f128 is involved return Op; } return SDValue(); } SDValue AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { // AArch64 FP-to-int conversions saturate to the destination element size, so // we can lower common saturating conversions to simple instructions. SDValue SrcVal = Op.getOperand(0); EVT SrcVT = SrcVal.getValueType(); EVT DstVT = Op.getValueType(); EVT SatVT = cast(Op.getOperand(1))->getVT(); uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits(); uint64_t DstElementWidth = DstVT.getScalarSizeInBits(); uint64_t SatWidth = SatVT.getScalarSizeInBits(); assert(SatWidth <= DstElementWidth && "Saturation width cannot exceed result width"); // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT. // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable // types, so this is hard to reach. if (DstVT.isScalableVector()) return SDValue(); EVT SrcElementVT = SrcVT.getVectorElementType(); // In the absence of FP16 support, promote f16 to f32 and saturate the result. if (SrcElementVT == MVT::f16 && (!Subtarget->hasFullFP16() || DstElementWidth > 16)) { MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements()); SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal); SrcVT = F32VT; SrcElementVT = MVT::f32; SrcElementWidth = 32; } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 && SrcElementVT != MVT::f16) return SDValue(); SDLoc DL(Op); // Cases that we can emit directly. if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT.getScalarType())); // Otherwise we emit a cvt that saturates to a higher BW, and saturate the // result. This is only valid if the legal cvt is larger than the saturate // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize // (at least until sqxtn is selected). if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64) return SDValue(); EVT IntVT = SrcVT.changeVectorElementTypeToInteger(); SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal, DAG.getValueType(IntVT.getScalarType())); SDValue Sat; if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { SDValue MinC = DAG.getConstant( APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT); SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC); SDValue MaxC = DAG.getConstant( APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT); Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC); } else { SDValue MinC = DAG.getConstant( APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT); Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC); } return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat); } SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { // AArch64 FP-to-int conversions saturate to the destination register size, so // we can lower common saturating conversions to simple instructions. 
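// Editor's sketch (plain C++, names illustrative): the "convert at a wider
// legal width, then clamp" strategy used by the saturating lowerings above,
// shown for f32 -> i16 going through i32. The pre-clamp of the float only
// sidesteps C++ undefined behaviour for huge inputs; on AArch64 the FCVTZS
// itself already saturates to the i32 range.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static int16_t fptosi_sat_i16(float f) {
  float bounded = std::min(std::max(f, -2147483648.0f), 2147483520.0f);
  int32_t wide = (int32_t)bounded;            // native-width conversion
  wide = std::min<int32_t>(wide, INT16_MAX);  // SMIN against signed max of i16
  wide = std::max<int32_t>(wide, INT16_MIN);  // SMAX against signed min of i16
  return (int16_t)wide;                       // final TRUNCATE
}

int main() {
  std::printf("%d %d %d\n", fptosi_sat_i16(1e9f), fptosi_sat_i16(-5.7f),
              fptosi_sat_i16(123.4f)); // 32767 -5 123
}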
SDValue SrcVal = Op.getOperand(0); EVT SrcVT = SrcVal.getValueType(); if (SrcVT.isVector()) return LowerVectorFP_TO_INT_SAT(Op, DAG); EVT DstVT = Op.getValueType(); EVT SatVT = cast(Op.getOperand(1))->getVT(); uint64_t SatWidth = SatVT.getScalarSizeInBits(); uint64_t DstWidth = DstVT.getScalarSizeInBits(); assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width"); // In the absence of FP16 support, promote f16 to f32 and saturate the result. if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) { SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal); SrcVT = MVT::f32; } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16) return SDValue(); SDLoc DL(Op); // Cases that we can emit directly. if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 || (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) && DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32)) return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT)); // Otherwise we emit a cvt that saturates to a higher BW, and saturate the // result. This is only valid if the legal cvt is larger than the saturate // width. if (DstWidth < SatWidth) return SDValue(); SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT)); SDValue Sat; if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { SDValue MinC = DAG.getConstant( APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT); SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC); SDValue MaxC = DAG.getConstant( APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT); Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC); } else { SDValue MinC = DAG.getConstant( APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT); Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC); } return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat); } SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. bool IsStrict = Op->isStrictFPOpcode(); EVT VT = Op.getValueType(); SDLoc dl(Op); SDValue In = Op.getOperand(IsStrict ? 1 : 0); EVT InVT = In.getValueType(); unsigned Opc = Op.getOpcode(); bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP; if (VT.isScalableVector()) { if (InVT.getVectorElementType() == MVT::i1) { // We can't directly extend an SVE predicate; extend it first. unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; EVT CastVT = getPromotedVTForPredicate(InVT); In = DAG.getNode(CastOpc, dl, CastVT, In); return DAG.getNode(Opc, dl, VT, In); } unsigned Opcode = IsSigned ? 
AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU; return LowerToPredicatedOp(Op, DAG, Opcode); } if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) || useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) return LowerFixedLengthIntToFPToSVE(Op, DAG); uint64_t VTSize = VT.getFixedSizeInBits(); uint64_t InVTSize = InVT.getFixedSizeInBits(); if (VTSize < InVTSize) { MVT CastVT = MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), InVT.getVectorNumElements()); if (IsStrict) { In = DAG.getNode(Opc, dl, {CastVT, MVT::Other}, {Op.getOperand(0), In}); return DAG.getNode( ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other}, {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)}); } In = DAG.getNode(Opc, dl, CastVT, In); return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)); } if (VTSize > InVTSize) { unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; EVT CastVT = VT.changeVectorElementTypeToInteger(); In = DAG.getNode(CastOpc, dl, CastVT, In); if (IsStrict) return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In}); return DAG.getNode(Opc, dl, VT, In); } // Use a scalar operation for conversions between single-element vectors of // the same size. if (VT.getVectorNumElements() == 1) { SDValue Extract = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(), In, DAG.getConstant(0, dl, MVT::i64)); EVT ScalarVT = VT.getScalarType(); if (IsStrict) return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other}, {Op.getOperand(0), Extract}); return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract); } return Op; } SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) return LowerVectorINT_TO_FP(Op, DAG); bool IsStrict = Op->isStrictFPOpcode(); SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); // f16 conversions are promoted to f32 when full fp16 is not supported. if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { SDLoc dl(Op); if (IsStrict) { SDValue Val = DAG.getNode(Op.getOpcode(), dl, {MVT::f32, MVT::Other}, {Op.getOperand(0), SrcVal}); return DAG.getNode( ISD::STRICT_FP_ROUND, dl, {MVT::f16, MVT::Other}, {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)}); } return DAG.getNode( ISD::FP_ROUND, dl, MVT::f16, DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal), DAG.getIntPtrConstant(0, dl)); } // i128 conversions are libcalls. if (SrcVal.getValueType() == MVT::i128) return SDValue(); // Other conversions are legal, unless it's to the completely software-based // fp128. if (Op.getValueType() != MVT::f128) return Op; return SDValue(); } SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { // For iOS, we want to call an alternative entry point: __sincos_stret, // which returns the values in two S / D registers. SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); ArgListTy Args; ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.IsSExt = false; Entry.IsZExt = false; Args.push_back(Entry); RTLIB::Libcall LC = ArgVT == MVT::f64 ? 
RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; const char *LibcallName = getLibcallName(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); StructType *RetTy = StructType::get(ArgTy, ArgTy); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); std::pair CallResult = LowerCallTo(CLI); return CallResult.first; } static MVT getSVEContainerType(EVT ContentTy); SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { EVT OpVT = Op.getValueType(); EVT ArgVT = Op.getOperand(0).getValueType(); if (useSVEForFixedLengthVectorVT(OpVT)) return LowerFixedLengthBitcastToSVE(Op, DAG); if (OpVT.isScalableVector()) { // Bitcasting between unpacked vector types of different element counts is // not a NOP because the live elements are laid out differently. // 01234567 // e.g. nxv2i32 = XX??XX?? // nxv4f16 = X?X?X?X? if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount()) return SDValue(); if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) { assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() && "Expected int->fp bitcast!"); SDValue ExtResult = DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT), Op.getOperand(0)); return getSVESafeBitCast(OpVT, ExtResult, DAG); } return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG); } if (OpVT != MVT::f16 && OpVT != MVT::bf16) return SDValue(); // Bitcasts between f16 and bf16 are legal. if (ArgVT == MVT::f16 || ArgVT == MVT::bf16) return Op; assert(ArgVT == MVT::i16); SDLoc DL(Op); Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op); } static EVT getExtensionTo64Bits(const EVT &OrigVT) { if (OrigVT.getSizeInBits() >= 64) return OrigVT; assert(OrigVT.isSimple() && "Expecting a simple value type"); MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; switch (OrigSimpleTy) { default: llvm_unreachable("Unexpected Vector Type"); case MVT::v2i8: case MVT::v2i16: return MVT::v2i32; case MVT::v4i8: return MVT::v4i16; } } static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode) { // The vector originally had a size of OrigTy. It was then extended to ExtTy. // We expect the ExtTy to be 128-bits total. If the OrigTy is less than // 64-bits we need to insert a new extension so that it will be 64-bits. assert(ExtTy.is128BitVector() && "Unexpected extension size"); if (OrigTy.getSizeInBits() >= 64) return N; // Must extend size to at least 64 bits to be used as an operand for VMULL. EVT NewVT = getExtensionTo64Bits(OrigTy); return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); } // Returns lane if Op extracts from a two-element vector and lane is constant // (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise. 
static std::optional getConstantLaneNumOfExtractHalfOperand(SDValue &Op) { SDNode *OpNode = Op.getNode(); if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return std::nullopt; EVT VT = OpNode->getOperand(0).getValueType(); ConstantSDNode *C = dyn_cast(OpNode->getOperand(1)); if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C) return std::nullopt; return C->getZExtValue(); } static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, bool isSigned) { EVT VT = N->getValueType(0); if (N->getOpcode() != ISD::BUILD_VECTOR) return false; for (const SDValue &Elt : N->op_values()) { if (ConstantSDNode *C = dyn_cast(Elt)) { unsigned EltSize = VT.getScalarSizeInBits(); unsigned HalfSize = EltSize / 2; if (isSigned) { if (!isIntN(HalfSize, C->getSExtValue())) return false; } else { if (!isUIntN(HalfSize, C->getZExtValue())) return false; } continue; } return false; } return true; } static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { if (ISD::isExtOpcode(N->getOpcode())) return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, N->getOperand(0)->getValueType(0), N->getValueType(0), N->getOpcode()); assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); EVT VT = N->getValueType(0); SDLoc dl(N); unsigned EltSize = VT.getScalarSizeInBits() / 2; unsigned NumElts = VT.getVectorNumElements(); MVT TruncVT = MVT::getIntegerVT(EltSize); SmallVector Ops; for (unsigned i = 0; i != NumElts; ++i) { ConstantSDNode *C = cast(N->getOperand(i)); const APInt &CInt = C->getAPIntValue(); // Element types smaller than 32 bits are not legal, so use i32 elements. // The values are implicitly truncated so sext vs. zext doesn't matter. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); } return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); } static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { return N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ANY_EXTEND || isExtendedBUILD_VECTOR(N, DAG, true); } static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { return N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND || isExtendedBUILD_VECTOR(N, DAG, false); } static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode == ISD::ADD || Opcode == ISD::SUB) { SDNode *N0 = N->getOperand(0).getNode(); SDNode *N1 = N->getOperand(1).getNode(); return N0->hasOneUse() && N1->hasOneUse() && isSignExtended(N0, DAG) && isSignExtended(N1, DAG); } return false; } static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode == ISD::ADD || Opcode == ISD::SUB) { SDNode *N0 = N->getOperand(0).getNode(); SDNode *N1 = N->getOperand(1).getNode(); return N0->hasOneUse() && N1->hasOneUse() && isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); } return false; } SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const { // The rounding mode is in bits 23:22 of the FPSCR. // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) // so that the shift + and get folded into a bitfield extract. 
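// Editor's sketch: a host-side check (no LLVM APIs) of the bit formulas
// implemented by LowerGET_ROUNDING and LowerSET_ROUNDING below.
#include <cassert>

int main() {
  // FPCR[23:22] -> FLT_ROUNDS value: (((FPCR + (1 << 22)) >> 22) & 3),
  // i.e. 0->1, 1->2, 2->3, 3->0.
  const unsigned fltRounds[4] = {1, 2, 3, 0};
  for (unsigned rm = 0; rm < 4; ++rm)
    assert(((((rm << 22) + (1u << 22)) >> 22) & 3) == fltRounds[rm]);

  // llvm.set.rounding argument -> FPCR[23:22]: ((arg - 1) & 3),
  // i.e. 0->3, 1->0, 2->1, 3->2.
  const unsigned fpcrRM[4] = {3, 0, 1, 2};
  for (unsigned arg = 0; arg < 4; ++arg)
    assert(((arg - 1) & 3) == fpcrRM[arg]);
  return 0;
}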
SDLoc dl(Op); SDValue Chain = Op.getOperand(0); SDValue FPCR_64 = DAG.getNode( ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other}, {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)}); Chain = FPCR_64.getValue(1); SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64); SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32, DAG.getConstant(1U << 22, dl, MVT::i32)); SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, DAG.getConstant(22, dl, MVT::i32)); SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, DAG.getConstant(3, dl, MVT::i32)); return DAG.getMergeValues({AND, Chain}, dl); } SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue Chain = Op->getOperand(0); SDValue RMValue = Op->getOperand(1); // The rounding mode is in bits 23:22 of the FPCR. // The llvm.set.rounding argument value to the rounding mode in FPCR mapping // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is // ((arg - 1) & 3) << 22). // // The argument of llvm.set.rounding must be within the segment [0, 3], so // NearestTiesToAway (4) is not handled here. It is responsibility of the code // generated llvm.set.rounding to ensure this condition. // Calculate new value of FPCR[23:22]. RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue, DAG.getConstant(1, DL, MVT::i32)); RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue, DAG.getConstant(0x3, DL, MVT::i32)); RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue, DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32)); RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue); // Get current value of FPCR. SDValue Ops[] = { Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)}; SDValue FPCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops); Chain = FPCR.getValue(1); FPCR = FPCR.getValue(0); // Put new rounding mode into FPSCR[23:22]. const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos); FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR, DAG.getConstant(RMMask, DL, MVT::i64)); FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue); SDValue Ops2[] = { Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR}; return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); } static unsigned selectUmullSmull(SDNode *&N0, SDNode *&N1, SelectionDAG &DAG, SDLoc DL, bool &IsMLA) { bool IsN0SExt = isSignExtended(N0, DAG); bool IsN1SExt = isSignExtended(N1, DAG); if (IsN0SExt && IsN1SExt) return AArch64ISD::SMULL; bool IsN0ZExt = isZeroExtended(N0, DAG); bool IsN1ZExt = isZeroExtended(N1, DAG); if (IsN0ZExt && IsN1ZExt) return AArch64ISD::UMULL; // Select SMULL if we can replace zext with sext. if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) && !isExtendedBUILD_VECTOR(N0, DAG, false) && !isExtendedBUILD_VECTOR(N1, DAG, false)) { SDValue ZextOperand; if (IsN0ZExt) ZextOperand = N0->getOperand(0); else ZextOperand = N1->getOperand(0); if (DAG.SignBitIsZero(ZextOperand)) { SDNode *NewSext = DAG.getSExtOrTrunc(ZextOperand, DL, N0->getValueType(0)).getNode(); if (IsN0ZExt) N0 = NewSext; else N1 = NewSext; return AArch64ISD::SMULL; } } // Select UMULL if we can replace the other operand with an extend. if (IsN0ZExt || IsN1ZExt) { EVT VT = N0->getValueType(0); APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(), VT.getScalarSizeInBits() / 2); if (DAG.MaskedValueIsZero(SDValue(IsN0ZExt ? 
N1 : N0, 0), Mask)) { EVT HalfVT; switch (VT.getSimpleVT().SimpleTy) { case MVT::v2i64: HalfVT = MVT::v2i32; break; case MVT::v4i32: HalfVT = MVT::v4i16; break; case MVT::v8i16: HalfVT = MVT::v8i8; break; default: return 0; } // Truncate and then extend the result. SDValue NewExt = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, SDValue(IsN0ZExt ? N1 : N0, 0)); NewExt = DAG.getZExtOrTrunc(NewExt, DL, VT); if (IsN0ZExt) N1 = NewExt.getNode(); else N0 = NewExt.getNode(); return AArch64ISD::UMULL; } } if (!IsN1SExt && !IsN1ZExt) return 0; // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these // into (s/zext A * s/zext C) + (s/zext B * s/zext C) if (IsN1SExt && isAddSubSExt(N0, DAG)) { IsMLA = true; return AArch64ISD::SMULL; } if (IsN1ZExt && isAddSubZExt(N0, DAG)) { IsMLA = true; return AArch64ISD::UMULL; } if (IsN0ZExt && isAddSubZExt(N1, DAG)) { std::swap(N0, N1); IsMLA = true; return AArch64ISD::UMULL; } return 0; } SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); bool OverrideNEON = !Subtarget->isNeonAvailable(); if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON)) return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED); // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so // that VMULL can be detected. Otherwise v2i64 multiplications are not legal. assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() && "unexpected type for custom-lowering ISD::MUL"); SDNode *N0 = Op.getOperand(0).getNode(); SDNode *N1 = Op.getOperand(1).getNode(); bool isMLA = false; EVT OVT = VT; if (VT.is64BitVector()) { if (N0->getOpcode() == ISD::EXTRACT_SUBVECTOR && isNullConstant(N0->getOperand(1)) && N1->getOpcode() == ISD::EXTRACT_SUBVECTOR && isNullConstant(N1->getOperand(1))) { N0 = N0->getOperand(0).getNode(); N1 = N1->getOperand(0).getNode(); VT = N0->getValueType(0); } else { if (VT == MVT::v1i64) { if (Subtarget->hasSVE()) return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED); // Fall through to expand this. It is not legal. return SDValue(); } else // Other vector multiplications are legal. return Op; } } SDLoc DL(Op); unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA); if (!NewOpc) { if (VT.getVectorElementType() == MVT::i64) { // If SVE is available then i64 vector multiplications can also be made // legal. if (Subtarget->hasSVE()) return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED); // Fall through to expand this. It is not legal. return SDValue(); } else // Other vector multiplications are legal. return Op; } // Legalize to a S/UMULL instruction SDValue Op0; SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); if (!isMLA) { Op0 = skipExtensionForVectorMULL(N0, DAG); assert(Op0.getValueType().is64BitVector() && Op1.getValueType().is64BitVector() && "unexpected types for extended operands to VMULL"); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT, DAG.getNode(NewOpc, DL, VT, Op0, Op1), DAG.getConstant(0, DL, MVT::i64)); } // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during // isel lowering to take advantage of no-stall back to back s/umul + s/umla. 
// This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); EVT Op1VT = Op1.getValueType(); return DAG.getNode( ISD::EXTRACT_SUBVECTOR, DL, OVT, DAG.getNode(N0->getOpcode(), DL, VT, DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)), DAG.getConstant(0, DL, MVT::i64)); } static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern) { if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all) return DAG.getConstant(1, DL, MVT::nxv1i1); return DAG.getNode(AArch64ISD::PTRUE, DL, VT, DAG.getTargetConstant(Pattern, DL, MVT::i32)); } static SDValue optimizeWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsLess, bool IsEqual) { if (!isa(Op.getOperand(1)) || !isa(Op.getOperand(2))) return SDValue(); SDLoc dl(Op); APInt X = Op.getConstantOperandAPInt(1); APInt Y = Op.getConstantOperandAPInt(2); APInt NumActiveElems; bool Overflow; if (IsLess) NumActiveElems = IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow); else NumActiveElems = IsSigned ? X.ssub_ov(Y, Overflow) : X.usub_ov(Y, Overflow); if (Overflow) return SDValue(); if (IsEqual) { APInt One(NumActiveElems.getBitWidth(), 1, IsSigned); NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow) : NumActiveElems.uadd_ov(One, Overflow); if (Overflow) return SDValue(); } std::optional PredPattern = getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue()); unsigned MinSVEVectorSize = std::max( DAG.getSubtarget().getMinSVEVectorSizeInBits(), 128u); unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements(); if (PredPattern != std::nullopt && NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize)) return getPTrue(DAG, dl, Op.getValueType(), *PredPattern); return SDValue(); } // Returns a safe bitcast between two scalable vector predicates, where // any newly created lanes from a widening bitcast are defined as zero. static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); EVT InVT = Op.getValueType(); assert(InVT.getVectorElementType() == MVT::i1 && VT.getVectorElementType() == MVT::i1 && "Expected a predicate-to-predicate bitcast"); assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && InVT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(InVT) && "Only expect to cast between legal scalable predicate types!"); // Return the operand if the cast isn't changing type, // e.g. -> if (InVT == VT) return Op; SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op); // We only have to zero the lanes if new lanes are being defined, e.g. when // casting from to . If this is not the // case (e.g. when casting from -> ) then // we can return here. if (InVT.bitsGT(VT)) return Reinterpret; // Check if the other lanes are already known to be zeroed by // construction. if (isZeroingInactiveLanes(Op)) return Reinterpret; // Zero the newly introduced lanes. 
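// Editor's sketch (plain C++ with GCC/Clang overflow builtins, illustrative
// names): the active-element count that optimizeWhile above derives before
// asking for a PTRUE pattern -- whilelo(X, Y) covers Y - X elements, the
// inclusive forms one more, and any overflow in that arithmetic makes the
// lowering bail out.
#include <cstdint>
#include <cstdio>
#include <optional>

static std::optional<uint64_t> whileLoCount(uint64_t X, uint64_t Y,
                                            bool Inclusive) {
  uint64_t N;
  if (__builtin_sub_overflow(Y, X, &N)) // Y < X: bail out, as the DAG code does
    return std::nullopt;
  if (Inclusive && __builtin_add_overflow(N, uint64_t(1), &N))
    return std::nullopt;
  return N;
}

int main() {
  if (auto N = whileLoCount(3, 11, /*Inclusive=*/false))
    std::printf("whilelo(3, 11) -> %llu active lanes\n",
                (unsigned long long)*N); // 8
}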
SDValue Mask = DAG.getConstant(1, DL, InVT); Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask); return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask); } SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain, SMEAttrs Attrs, SDLoc DL, EVT VT) const { if (Attrs.hasStreamingInterfaceOrBody()) return DAG.getConstant(1, DL, VT); if (Attrs.hasNonStreamingInterfaceAndBody()) return DAG.getConstant(0, DL, VT); assert(Attrs.hasStreamingCompatibleInterface() && "Unexpected interface"); SDValue Callee = DAG.getExternalSymbol("__arm_sme_state", getPointerTy(DAG.getDataLayout())); Type *Int64Ty = Type::getInt64Ty(*DAG.getContext()); Type *RetTy = StructType::get(Int64Ty, Int64Ty); TargetLowering::CallLoweringInfo CLI(DAG); ArgListTy Args; CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2, RetTy, Callee, std::move(Args)); std::pair CallResult = LowerCallTo(CLI); SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64); return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0), Mask); } static std::optional getCalleeAttrsFromExternalFunction(SDValue V) { if (auto *ES = dyn_cast(V)) { StringRef S(ES->getSymbol()); if (S == "__arm_sme_state" || S == "__arm_tpidr2_save") return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Preserved); if (S == "__arm_tpidr2_restore") return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Shared); } return std::nullopt; } SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = Op.getConstantOperandVal(1); SDLoc DL(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::aarch64_prefetch: { SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(2); unsigned IsWrite = cast(Op.getOperand(3))->getZExtValue(); unsigned Locality = cast(Op.getOperand(4))->getZExtValue(); unsigned IsStream = cast(Op.getOperand(5))->getZExtValue(); unsigned IsData = cast(Op.getOperand(6))->getZExtValue(); unsigned PrfOp = (IsWrite << 4) | // Load/Store bit (!IsData << 3) | // IsDataCache bit (Locality << 1) | // Cache level bits (unsigned)IsStream; // Stream bit return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain, DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr); } case Intrinsic::aarch64_sme_za_enable: return DAG.getNode( AArch64ISD::SMSTART, DL, MVT::Other, Op->getOperand(0), // Chain DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); case Intrinsic::aarch64_sme_za_disable: return DAG.getNode( AArch64ISD::SMSTOP, DL, MVT::Other, Op->getOperand(0), // Chain DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); } } SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = Op.getConstantOperandVal(1); SDLoc DL(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. 
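// Editor's sketch (plain C++): how LowerPREFETCH earlier in this section packs
// the __builtin_prefetch operands into the PRFM immediate. The
// aarch64_prefetch case above takes the cache level and stream bits directly
// from the intrinsic instead of deriving them from a locality hint.
#include <cstdio>

static unsigned encodePrfOp(bool IsWrite, unsigned Locality, bool IsData) {
  bool IsStream = Locality == 0;                // locality 0 => streaming
  unsigned Level = Locality ? 3 - Locality : 0; // locality 3 => L1 (level 0)
  return (IsWrite << 4)      // Load/Store bit
       | (!IsData << 3)      // IsDataCache bit
       | (Level << 1)        // Cache level bits
       | (unsigned)IsStream; // Stream bit
}

int main() {
  // Read, data cache, maximum locality: encoding 0, i.e. PLDL1KEEP.
  std::printf("%u\n", encodePrfOp(/*IsWrite=*/false, /*Locality=*/3,
                                  /*IsData=*/true));
}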
case Intrinsic::aarch64_mops_memset_tag: { auto Node = cast(Op.getNode()); SDValue Chain = Node->getChain(); SDValue Dst = Op.getOperand(2); SDValue Val = Op.getOperand(3); Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64); SDValue Size = Op.getOperand(4); auto Alignment = Node->getMemOperand()->getAlign(); bool IsVol = Node->isVolatile(); auto DstPtrInfo = Node->getPointerInfo(); const auto &SDI = static_cast(DAG.getSelectionDAGInfo()); SDValue MS = SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val, Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{}); // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise // LowerOperationWrapper will complain that the number of results has // changed. return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL); } } } SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); } case Intrinsic::aarch64_neon_abs: { EVT Ty = Op.getValueType(); if (Ty == MVT::i64) { SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op.getOperand(1)); Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result); return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result); } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) { return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1)); } else { report_fatal_error("Unexpected type for AArch64 NEON intrinic"); } } case Intrinsic::aarch64_neon_pmull64: { SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); std::optional LHSLane = getConstantLaneNumOfExtractHalfOperand(LHS); std::optional RHSLane = getConstantLaneNumOfExtractHalfOperand(RHS); assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1"); assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1"); // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2 // instructions execute on SIMD registers. So canonicalize i64 to v1i64, // which ISel recognizes better. For example, generate a ldr into d* // registers as opposed to a GPR load followed by a fmov. auto TryVectorizeOperand = [](SDValue N, std::optional NLane, std::optional OtherLane, const SDLoc &dl, SelectionDAG &DAG) -> SDValue { // If the operand is an higher half itself, rewrite it to // extract_high_v2i64; this way aarch64_neon_pmull64 could // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}. if (NLane && *NLane == 1) return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64, N.getOperand(0), DAG.getConstant(1, dl, MVT::i64)); // Operand N is not a higher half but the other operand is. if (OtherLane && *OtherLane == 1) { // If this operand is a lower half, rewrite it to // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to // align lanes of two operands. A roundtrip sequence (to move from lane // 1 to lane 0) is like this: // mov x8, v0.d[1] // fmov d0, x8 if (NLane && *NLane == 0) return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64, DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64, N.getOperand(0), DAG.getConstant(0, dl, MVT::i64)), DAG.getConstant(1, dl, MVT::i64)); // Otherwise just dup from main to all lanes. 
return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N); } // Neither operand is an extract of higher half, so codegen may just use // the non-high version of PMULL instruction. Use v1i64 to represent i64. assert(N.getValueType() == MVT::i64 && "Intrinsic aarch64_neon_pmull64 requires i64 parameters"); return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N); }; LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG); RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG); return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS); } case Intrinsic::aarch64_neon_smax: return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_neon_umax: return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_neon_smin: return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_neon_umin: return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_neon_scalar_sqxtn: case Intrinsic::aarch64_neon_scalar_sqxtun: case Intrinsic::aarch64_neon_scalar_uqxtn: { assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32); if (Op.getValueType() == MVT::i32) return DAG.getNode(ISD::BITCAST, dl, MVT::i32, DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32, Op.getOperand(0), DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op.getOperand(1)))); return SDValue(); } case Intrinsic::aarch64_sve_whilelo: return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true, /*IsEqual=*/false); case Intrinsic::aarch64_sve_whilelt: return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true, /*IsEqual=*/false); case Intrinsic::aarch64_sve_whilels: return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true, /*IsEqual=*/true); case Intrinsic::aarch64_sve_whilele: return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true, /*IsEqual=*/true); case Intrinsic::aarch64_sve_whilege: return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false, /*IsEqual=*/true); case Intrinsic::aarch64_sve_whilegt: return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false, /*IsEqual=*/false); case Intrinsic::aarch64_sve_whilehs: return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false, /*IsEqual=*/true); case Intrinsic::aarch64_sve_whilehi: return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false, /*IsEqual=*/false); case Intrinsic::aarch64_sve_sunpkhi: return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(), Op.getOperand(1)); case Intrinsic::aarch64_sve_sunpklo: return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(), Op.getOperand(1)); case Intrinsic::aarch64_sve_uunpkhi: return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(), Op.getOperand(1)); case Intrinsic::aarch64_sve_uunpklo: return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(), Op.getOperand(1)); case Intrinsic::aarch64_sve_clasta_n: return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::aarch64_sve_clastb_n: return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::aarch64_sve_lasta: return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_sve_lastb: return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case 
Intrinsic::aarch64_sve_rev: return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(), Op.getOperand(1)); case Intrinsic::aarch64_sve_tbl: return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_sve_trn1: return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_sve_trn2: return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_sve_uzp1: return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_sve_uzp2: return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_sve_zip1: return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_sve_zip2: return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_sve_splice: return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::aarch64_sve_ptrue: return getPTrue(DAG, dl, Op.getValueType(), cast(Op.getOperand(1))->getZExtValue()); case Intrinsic::aarch64_sve_clz: return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sme_cntsb: return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), DAG.getConstant(1, dl, MVT::i32)); case Intrinsic::aarch64_sme_cntsh: { SDValue One = DAG.getConstant(1, dl, MVT::i32); SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One); return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One); } case Intrinsic::aarch64_sme_cntsw: { SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), DAG.getConstant(1, dl, MVT::i32)); return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, DAG.getConstant(2, dl, MVT::i32)); } case Intrinsic::aarch64_sme_cntsd: { SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), DAG.getConstant(1, dl, MVT::i32)); return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, DAG.getConstant(3, dl, MVT::i32)); } case Intrinsic::aarch64_sve_cnt: { SDValue Data = Op.getOperand(3); // CTPOP only supports integer operands. 
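// Editor's sketch (plain C++): the aarch64_sme_cnts{h,w,d} cases above all
// derive their result from RDSVL #1, the streaming vector length in bytes, by
// shifting right by the log2 of the element size.
#include <cstdio>

int main() {
  unsigned long long svlBytes = 64; // e.g. a 512-bit streaming vector length
  std::printf("h=%llu w=%llu d=%llu\n",
              svlBytes >> 1,  // cntsh: 16-bit elements
              svlBytes >> 2,  // cntsw: 32-bit elements
              svlBytes >> 3); // cntsd: 64-bit elements
}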
if (Data.getValueType().isFloatingPoint()) Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data); return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Data, Op.getOperand(1)); } case Intrinsic::aarch64_sve_dupq_lane: return LowerDUPQLane(Op, DAG); case Intrinsic::aarch64_sve_convert_from_svbool: if (Op.getValueType() == MVT::aarch64svcount) return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1)); return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG); case Intrinsic::aarch64_sve_convert_to_svbool: if (Op.getOperand(1).getValueType() == MVT::aarch64svcount) return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1)); return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG); case Intrinsic::aarch64_sve_fneg: return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_frintp: return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_frintm: return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_frinti: return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_frintx: return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_frinta: return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_frintn: return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_frintz: return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_ucvtf: return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_scvtf: return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_fcvtzu: return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_fcvtzs: return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_fsqrt: return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_frecpx: return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_frecpe_x: return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(), Op.getOperand(1)); case Intrinsic::aarch64_sve_frecps_x: return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_sve_frsqrte_x: return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(), Op.getOperand(1)); case 
Intrinsic::aarch64_sve_frsqrts_x: return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_sve_fabs: return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_abs: return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_neg: return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_insr: { SDValue Scalar = Op.getOperand(2); EVT ScalarTy = Scalar.getValueType(); if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar); return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(), Op.getOperand(1), Scalar); } case Intrinsic::aarch64_sve_rbit: return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_revb: return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_revh: return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_revw: return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_revd: return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_sxtb: return DAG.getNode( AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)), Op.getOperand(1)); case Intrinsic::aarch64_sve_sxth: return DAG.getNode( AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)), Op.getOperand(1)); case Intrinsic::aarch64_sve_sxtw: return DAG.getNode( AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)), Op.getOperand(1)); case Intrinsic::aarch64_sve_uxtb: return DAG.getNode( AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)), Op.getOperand(1)); case Intrinsic::aarch64_sve_uxth: return DAG.getNode( AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)), Op.getOperand(1)); case Intrinsic::aarch64_sve_uxtw: return DAG.getNode( AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)), Op.getOperand(1)); case Intrinsic::localaddress: { const auto &MF = DAG.getMachineFunction(); const auto *RegInfo = Subtarget->getRegisterInfo(); unsigned Reg = RegInfo->getLocalAddressRegister(MF); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, Op.getSimpleValueType()); } case Intrinsic::eh_recoverfp: { // 
FIXME: This needs to be implemented to correctly handle highly aligned // stack objects. For now we simply return the incoming FP. Refer D53541 // for more details. SDValue FnOp = Op.getOperand(1); SDValue IncomingFPOp = Op.getOperand(2); GlobalAddressSDNode *GSD = dyn_cast(FnOp); auto *Fn = dyn_cast_or_null(GSD ? GSD->getGlobal() : nullptr); if (!Fn) report_fatal_error( "llvm.eh.recoverfp must take a function as the first argument"); return IncomingFPOp; } case Intrinsic::aarch64_neon_vsri: case Intrinsic::aarch64_neon_vsli: { EVT Ty = Op.getValueType(); if (!Ty.isVector()) report_fatal_error("Unexpected type for aarch64_neon_vsli"); assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits()); bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri; unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } case Intrinsic::aarch64_neon_srhadd: case Intrinsic::aarch64_neon_urhadd: case Intrinsic::aarch64_neon_shadd: case Intrinsic::aarch64_neon_uhadd: { bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || IntNo == Intrinsic::aarch64_neon_shadd); bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || IntNo == Intrinsic::aarch64_neon_urhadd); unsigned Opcode = IsSignedAdd ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS) : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU); return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } case Intrinsic::aarch64_neon_saddlp: case Intrinsic::aarch64_neon_uaddlp: { unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp ? AArch64ISD::UADDLP : AArch64ISD::SADDLP; return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1)); } case Intrinsic::aarch64_neon_sdot: case Intrinsic::aarch64_neon_udot: case Intrinsic::aarch64_sve_sdot: case Intrinsic::aarch64_sve_udot: { unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot || IntNo == Intrinsic::aarch64_sve_udot) ? AArch64ISD::UDOT : AArch64ISD::SDOT; return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } case Intrinsic::get_active_lane_mask: { SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID, Op.getOperand(1), Op.getOperand(2)); } } } bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const { if (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16) { EltTy = MVT::i32; return true; } return false; } bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT IndexVT, EVT DataVT) const { // SVE only supports implicit extension of 32-bit indices. if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32) return false; // Indices cannot be smaller than the main data type. if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits()) return false; // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit // element container type, which would violate the previous clause. 
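  // For example, nxv2i32 data is held unpacked in 64-bit containers, so a
  // 32-bit index would effectively be narrower than the data; such cases are
  // rejected below.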
  return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
}

bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  return ExtVal.getValueType().isScalableVector() ||
         Subtarget->useSVEForFixedLengthVectors();
}

unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
  std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
       AArch64ISD::GLD1_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
       AArch64ISD::GLD1_UXTW_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
       AArch64ISD::GLD1_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
       AArch64ISD::GLD1_SXTW_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
       AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
       AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
  };
  auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
  return AddrModes.find(Key)->second;
}

unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
  switch (Opcode) {
  default:
    llvm_unreachable("unimplemented opcode");
    return Opcode;
  case AArch64ISD::GLD1_MERGE_ZERO:
    return AArch64ISD::GLD1S_MERGE_ZERO;
  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
    return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
    return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
    return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
    return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
    return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
    return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
  }
}

SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
                                            SelectionDAG &DAG) const {
  MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
  SDLoc DL(Op);
  SDValue Chain = MGT->getChain();
  SDValue PassThru = MGT->getPassThru();
  SDValue Mask = MGT->getMask();
  SDValue BasePtr = MGT->getBasePtr();
  SDValue Index = MGT->getIndex();
  SDValue Scale = MGT->getScale();
  EVT VT = Op.getValueType();
  EVT MemVT = MGT->getMemoryVT();
  ISD::LoadExtType ExtType = MGT->getExtensionType();
  ISD::MemIndexType IndexType = MGT->getIndexType();

  // SVE supports zero (and so undef) passthrough values only, everything else
  // must be handled manually by an explicit select on the load's output.
  if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
    SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
    SDValue Load =
        DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
                            MGT->getMemOperand(), IndexType, ExtType);
    SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
    return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
  }

  bool IsScaled = MGT->isIndexScaled();
  bool IsSigned = MGT->isIndexSigned();

  // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
  // must be calculated beforehand.
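  // For example, a gather of i16 elements with Scale == 4 is rewritten below
  // as (Index << 2) with Scale == 1.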
uint64_t ScaleVal = cast(Scale)->getZExtValue(); if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) { assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types"); EVT IndexVT = Index.getValueType(); Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index, DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT)); Scale = DAG.getTargetConstant(1, DL, Scale.getValueType()); SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops, MGT->getMemOperand(), IndexType, ExtType); } // Lower fixed length gather to a scalable equivalent. if (VT.isFixedLengthVector()) { assert(Subtarget->useSVEForFixedLengthVectors() && "Cannot lower when not using SVE for fixed vectors!"); // NOTE: Handle floating-point as if integer then bitcast the result. EVT DataVT = VT.changeVectorElementTypeToInteger(); MemVT = MemVT.changeVectorElementTypeToInteger(); // Find the smallest integer fixed length vector we can use for the gather. EVT PromotedVT = VT.changeVectorElementType(MVT::i32); if (DataVT.getVectorElementType() == MVT::i64 || Index.getValueType().getVectorElementType() == MVT::i64 || Mask.getValueType().getVectorElementType() == MVT::i64) PromotedVT = VT.changeVectorElementType(MVT::i64); // Promote vector operands except for passthrough, which we know is either // undef or zero, and thus best constructed directly. unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index); Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask); // A promoted result type forces the need for an extending load. if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD) ExtType = ISD::EXTLOAD; EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT); // Convert fixed length vector operands to scalable. MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType()); Index = convertToScalableVector(DAG, ContainerVT, Index); Mask = convertFixedMaskToScalableVector(Mask, DAG); PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT) : DAG.getConstant(0, DL, ContainerVT); // Emit equivalent scalable vector gather. SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; SDValue Load = DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL, Ops, MGT->getMemOperand(), IndexType, ExtType); // Extract fixed length data then convert to the required result type. SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load); Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result); if (VT.isFloatingPoint()) Result = DAG.getNode(ISD::BITCAST, DL, VT, Result); return DAG.getMergeValues({Result, Load.getValue(1)}, DL); } // Everything else is legal. return Op; } SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const { MaskedScatterSDNode *MSC = cast(Op); SDLoc DL(Op); SDValue Chain = MSC->getChain(); SDValue StoreVal = MSC->getValue(); SDValue Mask = MSC->getMask(); SDValue BasePtr = MSC->getBasePtr(); SDValue Index = MSC->getIndex(); SDValue Scale = MSC->getScale(); EVT VT = StoreVal.getValueType(); EVT MemVT = MSC->getMemoryVT(); ISD::MemIndexType IndexType = MSC->getIndexType(); bool Truncating = MSC->isTruncatingStore(); bool IsScaled = MSC->isIndexScaled(); bool IsSigned = MSC->isIndexSigned(); // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else // must be calculated before hand. 
uint64_t ScaleVal = cast(Scale)->getZExtValue(); if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) { assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types"); EVT IndexVT = Index.getValueType(); Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index, DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT)); Scale = DAG.getTargetConstant(1, DL, Scale.getValueType()); SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops, MSC->getMemOperand(), IndexType, Truncating); } // Lower fixed length scatter to a scalable equivalent. if (VT.isFixedLengthVector()) { assert(Subtarget->useSVEForFixedLengthVectors() && "Cannot lower when not using SVE for fixed vectors!"); // Once bitcast we treat floating-point scatters as if integer. if (VT.isFloatingPoint()) { VT = VT.changeVectorElementTypeToInteger(); MemVT = MemVT.changeVectorElementTypeToInteger(); StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal); } // Find the smallest integer fixed length vector we can use for the scatter. EVT PromotedVT = VT.changeVectorElementType(MVT::i32); if (VT.getVectorElementType() == MVT::i64 || Index.getValueType().getVectorElementType() == MVT::i64 || Mask.getValueType().getVectorElementType() == MVT::i64) PromotedVT = VT.changeVectorElementType(MVT::i64); // Promote vector operands. unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index); Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask); StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal); // A promoted value type forces the need for a truncating store. if (PromotedVT != VT) Truncating = true; EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT); // Convert fixed length vector operands to scalable. MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType()); Index = convertToScalableVector(DAG, ContainerVT, Index); Mask = convertFixedMaskToScalableVector(Mask, DAG); StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal); // Emit equivalent scalable vector scatter. SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops, MSC->getMemOperand(), IndexType, Truncating); } // Everything else is legal. return Op; } SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MaskedLoadSDNode *LoadNode = cast(Op); assert(LoadNode && "Expected custom lowering of a masked load node"); EVT VT = Op->getValueType(0); if (useSVEForFixedLengthVectorVT( VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) return LowerFixedLengthVectorMLoadToSVE(Op, DAG); SDValue PassThru = LoadNode->getPassThru(); SDValue Mask = LoadNode->getMask(); if (PassThru->isUndef() || isZerosVector(PassThru.getNode())) return Op; SDValue Load = DAG.getMaskedLoad( VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(), LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(), LoadNode->getMemOperand(), LoadNode->getAddressingMode(), LoadNode->getExtensionType()); SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru); return DAG.getMergeValues({Result, Load.getValue(1)}, DL); } // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16. 
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT,
                                        EVT MemVT, SelectionDAG &DAG) {
  assert(VT.isVector() && "VT should be a vector type");
  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);

  SDValue Value = ST->getValue();

  // The lowering first extends the promoted v4i16 to v8i16, truncates to
  // v8i8, and extracts the word lane that represents the v4i8 subvector.
  // This optimizes the store to:
  //
  //   xtn  v0.8b, v0.8h
  //   str  s0, [x0]

  SDValue Undef = DAG.getUNDEF(MVT::i16);
  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
                                        {Undef, Undef, Undef, Undef});

  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
                                 Value, UndefVec);
  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);

  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
                                     Trunc, DAG.getConstant(0, DL, MVT::i64));

  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
                      ST->getBasePtr(), ST->getMemOperand());
}

// Custom lowering for any store, vector or scalar, with or without a
// truncate. Currently we only custom lower truncating stores from v4i16 to
// v4i8 and volatile stores of i128.
SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc Dl(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  assert(StoreNode && "Can only custom lower store nodes");

  SDValue Value = StoreNode->getValue();

  EVT VT = Value.getValueType();
  EVT MemVT = StoreNode->getMemoryVT();

  if (VT.isVector()) {
    if (useSVEForFixedLengthVectorVT(
            VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
      return LowerFixedLengthVectorStoreToSVE(Op, DAG);

    unsigned AS = StoreNode->getAddressSpace();
    Align Alignment = StoreNode->getAlign();
    if (Alignment < MemVT.getStoreSize() &&
        !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
                                        StoreNode->getMemOperand()->getFlags(),
                                        nullptr)) {
      return scalarizeVectorStore(StoreNode, DAG);
    }

    if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
        MemVT == MVT::v4i8) {
      return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
    }
    // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
    // the custom lowering, as there are no un-paired non-temporal stores and
    // legalization will break up 256 bit inputs.
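    // The wide value is split into two 128-bit halves below and emitted as a
    // single AArch64ISD::STNP node, which typically lowers to one
    // "stnp q0, q1, [x0]" (exact registers are illustrative).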
ElementCount EC = MemVT.getVectorElementCount(); if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u && EC.isKnownEven() && ((MemVT.getScalarSizeInBits() == 8u || MemVT.getScalarSizeInBits() == 16u || MemVT.getScalarSizeInBits() == 32u || MemVT.getScalarSizeInBits() == 64u))) { SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64)); SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), StoreNode->getValue(), DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64)); SDValue Result = DAG.getMemIntrinsicNode( AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other), {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, StoreNode->getMemoryVT(), StoreNode->getMemOperand()); return Result; } } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) { return LowerStore128(Op, DAG); } else if (MemVT == MVT::i64x8) { SDValue Value = StoreNode->getValue(); assert(Value->getValueType(0) == MVT::i64x8); SDValue Chain = StoreNode->getChain(); SDValue Base = StoreNode->getBasePtr(); EVT PtrVT = Base.getValueType(); for (unsigned i = 0; i < 8; i++) { SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64, Value, DAG.getConstant(i, Dl, MVT::i32)); SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base, DAG.getConstant(i * 8, Dl, PtrVT)); Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(), StoreNode->getOriginalAlign()); } return Chain; } return SDValue(); } /// Lower atomic or volatile 128-bit stores to a single STP instruction. SDValue AArch64TargetLowering::LowerStore128(SDValue Op, SelectionDAG &DAG) const { MemSDNode *StoreNode = cast(Op); assert(StoreNode->getMemoryVT() == MVT::i128); assert(StoreNode->isVolatile() || StoreNode->isAtomic()); bool IsStoreRelease = StoreNode->getMergedOrdering() == AtomicOrdering::Release; if (StoreNode->isAtomic()) assert((Subtarget->hasFeature(AArch64::FeatureLSE2) && Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) || StoreNode->getMergedOrdering() == AtomicOrdering::Unordered || StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic); SDValue Value = StoreNode->getOpcode() == ISD::STORE ? StoreNode->getOperand(1) : StoreNode->getOperand(2); SDLoc DL(Op); auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64); unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP; SDValue Result = DAG.getMemIntrinsicNode( Opcode, DL, DAG.getVTList(MVT::Other), {StoreNode->getChain(), StoreValue.first, StoreValue.second, StoreNode->getBasePtr()}, StoreNode->getMemoryVT(), StoreNode->getMemOperand()); return Result; } SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); LoadSDNode *LoadNode = cast(Op); assert(LoadNode && "Expected custom lowering of a load node"); if (LoadNode->getMemoryVT() == MVT::i64x8) { SmallVector Ops; SDValue Base = LoadNode->getBasePtr(); SDValue Chain = LoadNode->getChain(); EVT PtrVT = Base.getValueType(); for (unsigned i = 0; i < 8; i++) { SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, DAG.getConstant(i * 8, DL, PtrVT)); SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(), LoadNode->getOriginalAlign()); Ops.push_back(Part); Chain = SDValue(Part.getNode(), 1); } SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops); return DAG.getMergeValues({Loaded, Chain}, DL); } // Custom lowering for extending v4i8 vector loads. 
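  // The 32-bit memory value is loaded into an FP register and widened in
  // vector registers, so e.g. a zero-extending load to v4i16 typically becomes
  // roughly (illustrative):
  //   ldr   s0, [x0]
  //   ushll v0.8h, v0.8b, #0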
EVT VT = Op->getValueType(0); assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32"); if (LoadNode->getMemoryVT() != MVT::v4i8) return SDValue(); unsigned ExtType; if (LoadNode->getExtensionType() == ISD::SEXTLOAD) ExtType = ISD::SIGN_EXTEND; else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD || LoadNode->getExtensionType() == ISD::EXTLOAD) ExtType = ISD::ZERO_EXTEND; else return SDValue(); SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(), LoadNode->getBasePtr(), MachinePointerInfo()); SDValue Chain = Load.getValue(1); SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load); SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec); SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC); Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext, DAG.getConstant(0, DL, MVT::i64)); if (VT == MVT::v4i32) Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext); return DAG.getMergeValues({Ext, Chain}, DL); } // Generate SUBS and CSEL for integer abs. SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); if (VT.isVector()) return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU); SDLoc DL(Op); SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op.getOperand(0)); // Generate SUBS & CSEL. SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), Op.getOperand(0), DAG.getConstant(0, DL, VT)); return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg, DAG.getConstant(AArch64CC::PL, DL, MVT::i32), Cmp.getValue(1)); } static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) { SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Dest = Op.getOperand(2); AArch64CC::CondCode CC; if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) { SDLoc dl(Op); SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32); return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp); } return SDValue(); } SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Custom lowering: "); LLVM_DEBUG(Op.dump()); switch (Op.getOpcode()) { default: llvm_unreachable("unimplemented operand"); return SDValue(); case ISD::BITCAST: return LowerBITCAST(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SETCC: case ISD::STRICT_FSETCC: case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::UADDO_CARRY: return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/); case ISD::USUBO_CARRY: return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/); case ISD::SADDO_CARRY: return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/); case ISD::SSUBO_CARRY: return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/); case ISD::SADDO: case ISD::UADDO: case 
ISD::SSUBO: case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::FADD: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED); case ISD::FSUB: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED); case ISD::FMUL: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED); case ISD::FMA: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED); case ISD::FDIV: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED); case ISD::FNEG: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU); case ISD::FCEIL: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU); case ISD::FFLOOR: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU); case ISD::FNEARBYINT: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU); case ISD::FRINT: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU); case ISD::FROUND: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU); case ISD::FROUNDEVEN: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU); case ISD::FTRUNC: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU); case ISD::FSQRT: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU); case ISD::FABS: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU); case ISD::FP_ROUND: case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::SPONENTRY: return LowerSPONENTRY(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::ZERO_EXTEND_VECTOR_INREG: return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::SPLAT_VECTOR: return LowerSPLAT_VECTOR(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); case ISD::SDIV: case ISD::UDIV: return LowerDIV(Op, DAG); case ISD::SMIN: case ISD::UMIN: case ISD::SMAX: case ISD::UMAX: return LowerMinMax(Op, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerVectorSRA_SRL_SHL(Op, DAG); case ISD::SHL_PARTS: case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftParts(Op, DAG); case ISD::CTPOP: case ISD::PARITY: return LowerCTPOP_PARITY(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::OR: return LowerVectorOR(Op, DAG); case ISD::XOR: return LowerXOR(Op, DAG); case ISD::PREFETCH: return LowerPREFETCH(Op, DAG); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::STRICT_SINT_TO_FP: case ISD::STRICT_UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG); case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::MULHS: return LowerToPredicatedOp(Op, DAG, 
AArch64ISD::MULHS_PRED); case ISD::MULHU: return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED); case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); case ISD::ATOMIC_STORE: if (cast(Op)->getMemoryVT() == MVT::i128) { assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3()); return LowerStore128(Op, DAG); } return SDValue(); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::MSTORE: return LowerFixedLengthVectorMStoreToSVE(Op, DAG); case ISD::MGATHER: return LowerMGATHER(Op, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, DAG); case ISD::VECREDUCE_SEQ_FADD: return LowerVECREDUCE_SEQ_FADD(Op, DAG); case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_AND: case ISD::VECREDUCE_OR: case ISD::VECREDUCE_XOR: case ISD::VECREDUCE_SMAX: case ISD::VECREDUCE_SMIN: case ISD::VECREDUCE_UMAX: case ISD::VECREDUCE_UMIN: case ISD::VECREDUCE_FADD: case ISD::VECREDUCE_FMAX: case ISD::VECREDUCE_FMIN: case ISD::VECREDUCE_FMAXIMUM: case ISD::VECREDUCE_FMINIMUM: return LowerVECREDUCE(Op, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerATOMIC_LOAD_SUB(Op, DAG); case ISD::ATOMIC_LOAD_AND: return LowerATOMIC_LOAD_AND(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::VSCALE: return LowerVSCALE(Op, DAG); case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: return LowerFixedLengthVectorIntExtendToSVE(Op, DAG); case ISD::SIGN_EXTEND_INREG: { // Only custom lower when ExtraVT has a legal byte based element type. EVT ExtraVT = cast(Op.getOperand(1))->getVT(); EVT ExtraEltVT = ExtraVT.getVectorElementType(); if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) && (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64)) return SDValue(); return LowerToPredicatedOp(Op, DAG, AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU); } case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::MLOAD: return LowerMLOAD(Op, DAG); case ISD::LOAD: if (useSVEForFixedLengthVectorVT(Op.getValueType(), !Subtarget->isNeonAvailable())) return LowerFixedLengthVectorLoadToSVE(Op, DAG); return LowerLOAD(Op, DAG); case ISD::ADD: case ISD::AND: case ISD::SUB: return LowerToScalableOp(Op, DAG); case ISD::FMAXIMUM: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED); case ISD::FMAXNUM: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED); case ISD::FMINIMUM: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED); case ISD::FMINNUM: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED); case ISD::VSELECT: return LowerFixedLengthVectorSelectToSVE(Op, DAG); case ISD::ABS: return LowerABS(Op, DAG); case ISD::ABDS: return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED); case ISD::ABDU: return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED); case ISD::AVGFLOORS: return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED); case ISD::AVGFLOORU: return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED); case ISD::AVGCEILS: return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED); case ISD::AVGCEILU: return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED); case ISD::BITREVERSE: return LowerBitreverse(Op, DAG); case ISD::BSWAP: return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU); case ISD::CTLZ: return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU); case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::VECTOR_SPLICE: return LowerVECTOR_SPLICE(Op, DAG); case ISD::VECTOR_DEINTERLEAVE: return LowerVECTOR_DEINTERLEAVE(Op, DAG); case 
ISD::VECTOR_INTERLEAVE: return LowerVECTOR_INTERLEAVE(Op, DAG); case ISD::STRICT_LROUND: case ISD::STRICT_LLROUND: case ISD::STRICT_LRINT: case ISD::STRICT_LLRINT: { assert(Op.getOperand(1).getValueType() == MVT::f16 && "Expected custom lowering of rounding operations only for f16"); SDLoc DL(Op); SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other}, {Op.getOperand(0), Op.getOperand(1)}); return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other}, {Ext.getValue(1), Ext.getValue(0)}); } case ISD::WRITE_REGISTER: { assert(Op.getOperand(2).getValueType() == MVT::i128 && "WRITE_REGISTER custom lowering is only for 128-bit sysregs"); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue SysRegName = Op.getOperand(1); std::pair Pair = DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64); // chain = MSRR(chain, sysregname, lo, hi) SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain, SysRegName, Pair.first, Pair.second); return Result; } } } bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const { return !Subtarget->useSVEForFixedLengthVectors(); } bool AArch64TargetLowering::useSVEForFixedLengthVectorVT( EVT VT, bool OverrideNEON) const { if (!VT.isFixedLengthVector() || !VT.isSimple()) return false; // Don't use SVE for vectors we cannot scalarize if required. switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { // Fixed length predicates should be promoted to i8. // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work. case MVT::i1: default: return false; case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: case MVT::f16: case MVT::f32: case MVT::f64: break; } // All SVE implementations support NEON sized vectors. if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector())) return Subtarget->hasSVE(); // Ensure NEON MVTs only belong to a single register class. if (VT.getFixedSizeInBits() <= 128) return false; // Ensure wider than NEON code generation is enabled. if (!Subtarget->useSVEForFixedLengthVectors()) return false; // Don't use SVE for types that don't fit. if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits()) return false; // TODO: Perhaps an artificial restriction, but worth having whilst getting // the base fixed length SVE support in place. if (!VT.isPow2VectorType()) return false; return true; } //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// static unsigned getIntrinsicID(const SDNode *N) { unsigned Opcode = N->getOpcode(); switch (Opcode) { default: return Intrinsic::not_intrinsic; case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast(N->getOperand(0))->getZExtValue(); if (IID < Intrinsic::num_intrinsics) return IID; return Intrinsic::not_intrinsic; } } } bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const { if (!N0.hasOneUse()) return false; unsigned IID = getIntrinsicID(N1.getNode()); // Avoid reassociating expressions that can be lowered to smlal/umlal. if (IID == Intrinsic::aarch64_neon_umull || N1.getOpcode() == AArch64ISD::UMULL || IID == Intrinsic::aarch64_neon_smull || N1.getOpcode() == AArch64ISD::SMULL) return N0.getOpcode() != ISD::ADD; return true; } /// Selects the correct CCAssignFn for a given CallingConvention value. 
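/// For example, an ordinary C call uses CC_AArch64_AAPCS on most targets,
/// while a variadic call on Windows selects CC_AArch64_Win64_VarArg (or
/// CC_AArch64_Arm64EC_VarArg under Arm64EC).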
CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const { switch (CC) { default: report_fatal_error("Unsupported calling convention."); case CallingConv::WebKit_JS: return CC_AArch64_WebKit_JS; case CallingConv::GHC: return CC_AArch64_GHC; case CallingConv::C: case CallingConv::Fast: case CallingConv::PreserveMost: case CallingConv::PreserveAll: case CallingConv::CXX_FAST_TLS: case CallingConv::Swift: case CallingConv::SwiftTail: case CallingConv::Tail: if (Subtarget->isTargetWindows() && IsVarArg) { if (Subtarget->isWindowsArm64EC()) return CC_AArch64_Arm64EC_VarArg; return CC_AArch64_Win64_VarArg; } if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; if (!IsVarArg) return CC_AArch64_DarwinPCS; return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg : CC_AArch64_DarwinPCS_VarArg; case CallingConv::Win64: if (IsVarArg) { if (Subtarget->isWindowsArm64EC()) return CC_AArch64_Arm64EC_VarArg; return CC_AArch64_Win64_VarArg; } return CC_AArch64_AAPCS; case CallingConv::CFGuard_Check: return CC_AArch64_Win64_CFGuard_Check; case CallingConv::AArch64_VectorCall: case CallingConv::AArch64_SVE_VectorCall: case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0: case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2: return CC_AArch64_AAPCS; } } CCAssignFn * AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const { return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; } unsigned AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case) SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, DAG.getConstant(1, DL, MVT::i32)); SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N); SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)}; SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other); SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops); Chain = Buffer.getValue(1); MFI.CreateVariableSizedObject(Align(1), nullptr); // Allocate an additional TPIDR2 object on the stack (16 bytes) unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false); // Store the buffer pointer to the TPIDR2 stack object. MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); SDValue Ptr = DAG.getFrameIndex( TPIDR2Obj, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI); return TPIDR2Obj; } SDValue AArch64TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); const Function &F = MF.getFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv()); AArch64FunctionInfo *FuncInfo = MF.getInfo(); SmallVector Outs; GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs, DAG.getTargetLoweringInfo(), MF.getDataLayout()); if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); })) FuncInfo->setIsSVECC(true); // Assign locations to all of the incoming arguments. 
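  // Both register and stack locations are computed first; the loop further
  // down then materialises each location as either a CopyFromReg or a stack
  // load.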
SmallVector ArgLocs; DenseMap CopiedRegs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // At this point, Ins[].VT may already be promoted to i32. To correctly // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here // we use a special version of AnalyzeFormalArguments to pass in ValVT and // LocVT. unsigned NumArgs = Ins.size(); Function::const_arg_iterator CurOrigArg = F.arg_begin(); unsigned CurArgIdx = 0; for (unsigned i = 0; i != NumArgs; ++i) { MVT ValVT = Ins[i].VT; if (Ins[i].isOrigArg()) { std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); CurArgIdx = Ins[i].getOrigArgIndex(); // Get type of the original argument. EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), /*AllowUnknown*/ true); MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) ValVT = MVT::i8; else if (ActualMVT == MVT::i16) ValVT = MVT::i16; } bool UseVarArgCC = false; if (IsWin64) UseVarArgCC = isVarArg; CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC); bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } SMEAttrs Attrs(MF.getFunction()); bool IsLocallyStreaming = !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody(); assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value"); SDValue Glue = Chain.getValue(1); SmallVector ArgValues; unsigned ExtraArgLocs = 0; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; if (Ins[i].Flags.isByVal()) { // Byval is used for HFAs in the PCS, but the system should work in a // non-compliant manner for larger structs. EVT PtrVT = getPointerTy(DAG.getDataLayout()); int Size = Ins[i].Flags.getByValSize(); unsigned NumRegs = (Size + 7) / 8; // FIXME: This works on big-endian for composite byvals, which are the common // case. It should also work for fundamental types too. unsigned FrameIdx = MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); InVals.push_back(FrameIdxN); continue; } if (Ins[i].Flags.isSwiftAsync()) MF.getInfo()->setHasSwiftAsyncContext(true); SDValue ArgValue; if (VA.isRegLoc()) { // Arguments stored in registers. EVT RegVT = VA.getLocVT(); const TargetRegisterClass *RC; if (RegVT == MVT::i32) RC = &AArch64::GPR32RegClass; else if (RegVT == MVT::i64) RC = &AArch64::GPR64RegClass; else if (RegVT == MVT::f16 || RegVT == MVT::bf16) RC = &AArch64::FPR16RegClass; else if (RegVT == MVT::f32) RC = &AArch64::FPR32RegClass; else if (RegVT == MVT::f64 || RegVT.is64BitVector()) RC = &AArch64::FPR64RegClass; else if (RegVT == MVT::f128 || RegVT.is128BitVector()) RC = &AArch64::FPR128RegClass; else if (RegVT.isScalableVector() && RegVT.getVectorElementType() == MVT::i1) { FuncInfo->setIsSVECC(true); RC = &AArch64::PPRRegClass; } else if (RegVT == MVT::aarch64svcount) { FuncInfo->setIsSVECC(true); RC = &AArch64::PPRRegClass; } else if (RegVT.isScalableVector()) { FuncInfo->setIsSVECC(true); RC = &AArch64::ZPRRegClass; } else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); // Transform the arguments in physical registers into virtual ones. 
Register Reg = MF.addLiveIn(VA.getLocReg(), RC); if (IsLocallyStreaming) { // LocallyStreamingFunctions must insert the SMSTART in the correct // position, so we use Glue to ensure no instructions can be scheduled // between the chain of: // t0: ch,glue = EntryNode // t1: res,ch,glue = CopyFromReg // ... // tn: res,ch,glue = CopyFromReg t(n-1), .. // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2 // ^^^^^^ // This will be the new Chain/Root node. ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue); Glue = ArgValue.getValue(2); } else ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); // If this is an 8, 16 or 32-bit value, it is really passed promoted // to 64 bits. Insert an assert[sz]ext to capture this, then // truncate to the right size. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::Indirect: assert( (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) && "Indirect arguments should be scalable on most subtargets"); break; case CCValAssign::BCvt: ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); break; case CCValAssign::AExt: case CCValAssign::SExt: case CCValAssign::ZExt: break; case CCValAssign::AExtUpper: ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue, DAG.getConstant(32, DL, RegVT)); ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT()); break; } } else { // VA.isRegLoc() assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); unsigned ArgOffset = VA.getLocMemOffset(); unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect ? VA.getLocVT().getSizeInBits() : VA.getValVT().getSizeInBits()) / 8; uint32_t BEAlign = 0; if (!Subtarget->isLittleEndian() && ArgSize < 8 && !Ins[i].Flags.isInConsecutiveRegs()) BEAlign = 8 - ArgSize; SDValue FIN; MachinePointerInfo PtrInfo; if (isVarArg && Subtarget->isWindowsArm64EC()) { // In the ARM64EC varargs convention, fixed arguments on the stack are // accessed relative to x4, not sp. unsigned ObjOffset = ArgOffset + BEAlign; Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val, DAG.getConstant(ObjOffset, DL, MVT::i64)); PtrInfo = MachinePointerInfo::getUnknownStack(MF); } else { int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); // Create load nodes to retrieve arguments from the stack. 
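      // The extension type chosen below mirrors VA.getLocInfo(), so e.g. a
      // sign-extended i8 argument is reloaded with SEXTLOAD and a
      // zero-extended one with ZEXTLOAD.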
FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); } // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; MVT MemVT = VA.getValVT(); switch (VA.getLocInfo()) { default: break; case CCValAssign::Trunc: case CCValAssign::BCvt: MemVT = VA.getLocVT(); break; case CCValAssign::Indirect: assert((VA.getValVT().isScalableVector() || Subtarget->isWindowsArm64EC()) && "Indirect arguments should be scalable on most subtargets"); MemVT = VA.getLocVT(); break; case CCValAssign::SExt: ExtType = ISD::SEXTLOAD; break; case CCValAssign::ZExt: ExtType = ISD::ZEXTLOAD; break; case CCValAssign::AExt: ExtType = ISD::EXTLOAD; break; } ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo, MemVT); } if (VA.getLocInfo() == CCValAssign::Indirect) { assert((VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) && "Indirect arguments should be scalable on most subtargets"); uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue(); unsigned NumParts = 1; if (Ins[i].Flags.isInConsecutiveRegs()) { assert(!Ins[i].Flags.isInConsecutiveRegsLast()); while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) ++NumParts; } MVT PartLoad = VA.getValVT(); SDValue Ptr = ArgValue; // Ensure we generate all loads for each tuple part, whilst updating the // pointer after each load correctly using vscale. while (NumParts > 0) { ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo()); InVals.push_back(ArgValue); NumParts--; if (NumParts > 0) { SDValue BytesIncrement; if (PartLoad.isScalableVector()) { BytesIncrement = DAG.getVScale( DL, Ptr.getValueType(), APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize)); } else { BytesIncrement = DAG.getConstant( APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, Ptr.getValueType()); } SDNodeFlags Flags; Flags.setNoUnsignedWrap(true); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, BytesIncrement, Flags); ExtraArgLocs++; i++; } } } else { if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(), ArgValue, DAG.getValueType(MVT::i32)); // i1 arguments are zero-extended to i8 by the caller. Emit a // hint to reflect this. if (Ins[i].isOrigArg()) { Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex()); if (OrigArg->getType()->isIntegerTy(1)) { if (!Ins[i].Flags.isZExt()) { ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL, ArgValue.getValueType(), ArgValue); } } } InVals.push_back(ArgValue); } } assert((ArgLocs.size() + ExtraArgLocs) == Ins.size()); // Insert the SMSTART if this is a locally streaming function and // make sure it is Glued to the last CopyFromReg value. if (IsLocallyStreaming) { const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); Chain = DAG.getNode( AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), {DAG.getRoot(), DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64), DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()), Glue}); // Ensure that the SMSTART happens after the CopyWithChain such that its // chain result is used. for (unsigned I=0; IisTargetDarwin() || IsWin64) { // The AAPCS variadic function ABI is identical to the non-variadic // one. As a result there may be more arguments in registers and we should // save them for future reference. 
// Win64 variadic functions also pass arguments in registers, but all float // arguments are passed in integer registers. saveVarArgRegisters(CCInfo, DAG, DL, Chain); } // This will point to the next argument passed via stack. unsigned VarArgsOffset = CCInfo.getStackSize(); // We currently pass all varargs at 8-byte alignment, or 4 for ILP32 VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8); FuncInfo->setVarArgsStackOffset(VarArgsOffset); FuncInfo->setVarArgsStackIndex( MFI.CreateFixedObject(4, VarArgsOffset, true)); if (MFI.hasMustTailInVarArgFunc()) { SmallVector RegParmTypes; RegParmTypes.push_back(MVT::i64); RegParmTypes.push_back(MVT::f128); // Compute the set of forwarded registers. The rest are scratch. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_AArch64_AAPCS); // Conservatively forward X8, since it might be used for aggregate return. if (!CCInfo.isAllocated(AArch64::X8)) { Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64)); } } } // On Windows, InReg pointers must be returned, so record the pointer in a // virtual register at the start of the function so it can be returned in the // epilogue. if (IsWin64) { for (unsigned I = 0, E = Ins.size(); I != E; ++I) { if (Ins[I].Flags.isInReg() && Ins[I].Flags.isSRet()) { assert(!FuncInfo->getSRetReturnReg()); MVT PtrTy = getPointerTy(DAG.getDataLayout()); Register Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]); Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain); break; } } } unsigned StackArgSize = CCInfo.getStackSize(); bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { // This is a non-standard ABI so by fiat I say we're allowed to make full // use of the stack area to be popped, which must be aligned to 16 bytes in // any case: StackArgSize = alignTo(StackArgSize, 16); // If we're expected to restore the stack (e.g. fastcc) then we'll be adding // a multiple of 16. FuncInfo->setArgumentStackToRestore(StackArgSize); // This realignment carries over to the available bytes below. Our own // callers will guarantee the space is free by giving an aligned value to // CALLSEQ_START. } // Even if we're not expected to free up the space, it's useful to know how // much is there while considering tail calls (because we can reuse it). FuncInfo->setBytesInStackArgArea(StackArgSize); if (Subtarget->hasCustomCallingConv()) Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); // Conservatively assume the function requires the lazy-save mechanism. 
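  // Any function with ZA state therefore gets a 16-byte TPIDR2 block and an
  // SVL.B * SVL.B byte buffer allocated up front, whether or not a callee
  // actually requires the lazy save.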
if (SMEAttrs(MF.getFunction()).hasZAState()) { unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG); FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj); } return Chain; } void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL, SDValue &Chain) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); auto PtrVT = getPointerTy(DAG.getDataLayout()); bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); SmallVector MemOps; auto GPRArgRegs = AArch64::getGPRArgRegs(); unsigned NumGPRArgRegs = GPRArgRegs.size(); if (Subtarget->isWindowsArm64EC()) { // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs // functions. NumGPRArgRegs = 4; } unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); int GPRIdx = 0; if (GPRSaveSize != 0) { if (IsWin64) { GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false); if (GPRSaveSize & 15) // The extra size here, if triggered, will always be 8. MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false); } else GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false); SDValue FIN; if (Subtarget->isWindowsArm64EC()) { // With the Arm64EC ABI, we reserve the save area as usual, but we // compute its address relative to x4. For a normal AArch64->AArch64 // call, x4 == sp on entry, but calls from an entry thunk can pass in a // different address. Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val, DAG.getConstant(GPRSaveSize, DL, MVT::i64)); } else { FIN = DAG.getFrameIndex(GPRIdx, PtrVT); } for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, IsWin64 ? MachinePointerInfo::getFixedStack( MF, GPRIdx, (i - FirstVariadicGPR) * 8) : MachinePointerInfo::getStack(MF, i * 8)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); } } FuncInfo->setVarArgsGPRIndex(GPRIdx); FuncInfo->setVarArgsGPRSize(GPRSaveSize); if (Subtarget->hasFPARMv8() && !IsWin64) { auto FPRArgRegs = AArch64::getFPRArgRegs(); const unsigned NumFPRArgRegs = FPRArgRegs.size(); unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); int FPRIdx = 0; if (FPRSaveSize != 0) { FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false); SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, MachinePointerInfo::getStack(MF, i * 16)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(16, DL, PtrVT)); } } FuncInfo->setVarArgsFPRIndex(FPRIdx); FuncInfo->setVarArgsFPRSize(FPRSaveSize); } if (!MemOps.empty()) { Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } } /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. 
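/// Each physical result register is copied out at most once; further results
/// assigned to the same register reuse the cached value (see CopiedRegs
/// below).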
SDValue AArch64TargetLowering::LowerCallResult( SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &RVLocs, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const { DenseMap CopiedRegs; // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign VA = RVLocs[i]; // Pass 'this' value directly from the argument to return value, to avoid // reg unit interference if (i == 0 && isThisReturn) { assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && "unexpected return calling convention register assignment"); InVals.push_back(ThisVal); continue; } // Avoid copying a physreg twice since RegAllocFast is incompetent and only // allows one use of a physreg per block. SDValue Val = CopiedRegs.lookup(VA.getLocReg()); if (!Val) { Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue); Chain = Val.getValue(1); InGlue = Val.getValue(2); CopiedRegs[VA.getLocReg()] = Val; } switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); break; case CCValAssign::AExtUpper: Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val, DAG.getConstant(32, DL, VA.getLocVT())); [[fallthrough]]; case CCValAssign::AExt: [[fallthrough]]; case CCValAssign::ZExt: Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT()); break; } InVals.push_back(Val); } return Chain; } /// Return true if the calling convention is one that we can guarantee TCO for. static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) { return (CC == CallingConv::Fast && GuaranteeTailCalls) || CC == CallingConv::Tail || CC == CallingConv::SwiftTail; } /// Return true if we might ever do TCO for calls with this calling convention. static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { case CallingConv::C: case CallingConv::AArch64_SVE_VectorCall: case CallingConv::PreserveMost: case CallingConv::PreserveAll: case CallingConv::Swift: case CallingConv::SwiftTail: case CallingConv::Tail: case CallingConv::Fast: return true; default: return false; } } static void analyzeCallOperands(const AArch64TargetLowering &TLI, const AArch64Subtarget *Subtarget, const TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo) { const SelectionDAG &DAG = CLI.DAG; CallingConv::ID CalleeCC = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; const SmallVector &Outs = CLI.Outs; bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; bool UseVarArgCC = false; if (IsVarArg) { // On Windows, the fixed arguments in a vararg call are passed in GPRs // too, so use the vararg CC to force them to integer registers. if (IsCalleeWin64) { UseVarArgCC = true; } else { UseVarArgCC = !Outs[i].IsFixed; } } if (!UseVarArgCC) { // Get type of the original argument. EVT ActualVT = TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty, /*AllowUnknown*/ true); MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT; // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) ArgVT = MVT::i8; else if (ActualMVT == MVT::i16) ArgVT = MVT::i16; } CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC); bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } } bool AArch64TargetLowering::isEligibleForTailCallOptimization( const CallLoweringInfo &CLI) const { CallingConv::ID CalleeCC = CLI.CallConv; if (!mayTailCallThisCC(CalleeCC)) return false; SDValue Callee = CLI.Callee; bool IsVarArg = CLI.IsVarArg; const SmallVector &Outs = CLI.Outs; const SmallVector &OutVals = CLI.OutVals; const SmallVector &Ins = CLI.Ins; const SelectionDAG &DAG = CLI.DAG; MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF.getCallingConv(); // SME Streaming functions are not eligible for TCO as they may require // the streaming mode or ZA to be restored after returning from the call. SMEAttrs CallerAttrs(MF.getFunction()); auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal); if (CallerAttrs.requiresSMChange(CalleeAttrs) || CallerAttrs.requiresLazySave(CalleeAttrs)) return false; // Functions using the C or Fast calling convention that have an SVE signature // preserve more registers and should assume the SVE_VectorCall CC. // The check for matching callee-saved regs will determine whether it is // eligible for TCO. if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) && MF.getInfo()->isSVECC()) CallerCC = CallingConv::AArch64_SVE_VectorCall; bool CCMatch = CallerCC == CalleeCC; // When using the Windows calling convention on a non-windows OS, we want // to back up and restore X18 in such functions; we can't do a tail call // from those functions. if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() && CalleeCC != CallingConv::Win64) return false; // Byval parameters hand the function a pointer directly into the stack area // we want to reuse during a tail call. Working around this *is* possible (see // X86) but less efficient and uglier in LowerCall. for (Function::const_arg_iterator i = CallerF.arg_begin(), e = CallerF.arg_end(); i != e; ++i) { if (i->hasByValAttr()) return false; // On Windows, "inreg" attributes signify non-aggregate indirect returns. // In this case, it is necessary to save/restore X0 in the callee. Tail // call opt interferes with this. So we disable tail call opt when the // caller has an argument with "inreg" attribute. // FIXME: Check whether the callee also has an "inreg" argument. if (i->hasInRegAttr()) return false; } if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt)) return CCMatch; // Externally-defined functions with weak linkage should not be // tail-called on AArch64 when the OS does not support dynamic // pre-emption of symbols, as the AAELF spec requires normal calls // to undefined weak functions to be replaced with a NOP or jump to the // next instruction. The behaviour of branch instructions in this // situation (as used for tail calls) is implementation-defined, so we // cannot rely on the linker replacing the tail call with a return. 
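  // For example, a call to an undefined "__attribute__((weak)) void foo();"
  // must stay a normal "bl foo" (which the linker may rewrite to a NOP); a
  // tail-call "b foo" has no such defined rewriting on these targets.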
if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); const Triple &TT = getTargetMachine().getTargetTriple(); if (GV->hasExternalWeakLinkage() && (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) return false; } // Now we search for cases where we can use a tail call without changing the // ABI. Sibcall is used in some places (particularly gcc) to refer to this // concept. // I want anyone implementing a new calling convention to think long and hard // about this assert. assert((!IsVarArg || CalleeCC == CallingConv::C) && "Unexpected variadic calling convention"); LLVMContext &C = *DAG.getContext(); // Check that the call results are passed in the same way. if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, CCAssignFnForCall(CalleeCC, IsVarArg), CCAssignFnForCall(CallerCC, IsVarArg))) return false; // The callee has to preserve all registers the caller needs to preserve. const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (!CCMatch) { const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); if (Subtarget->hasCustomCallingConv()) { TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved); TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved); } if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; } // Nothing more to check if the callee is taking no arguments if (Outs.empty()) return true; SmallVector ArgLocs; CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C); analyzeCallOperands(*this, Subtarget, CLI, CCInfo); if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) { // When we are musttail, additional checks have been done and we can safely ignore this check // At least two cases here: if caller is fastcc then we can't have any // memory arguments (we'd be expected to clean up the stack afterwards). If // caller is C then we could potentially use its argument area. // FIXME: for now we take the most conservative of these in both cases: // disallow all variadic memory operands. for (const CCValAssign &ArgLoc : ArgLocs) if (!ArgLoc.isRegLoc()) return false; } const AArch64FunctionInfo *FuncInfo = MF.getInfo(); // If any of the arguments is passed indirectly, it must be SVE, so the // 'getBytesInStackArgArea' is not sufficient to determine whether we need to // allocate space on the stack. That is why we determine this explicitly here // the call cannot be a tailcall. if (llvm::any_of(ArgLocs, [&](CCValAssign &A) { assert((A.getLocInfo() != CCValAssign::Indirect || A.getValVT().isScalableVector() || Subtarget->isWindowsArm64EC()) && "Expected value to be scalable"); return A.getLocInfo() == CCValAssign::Indirect; })) return false; // If the stack arguments for this call do not fit into our own save area then // the call cannot be made tail. if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) return false; const MachineRegisterInfo &MRI = MF.getRegInfo(); if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) return false; return true; } SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const { SmallVector ArgChains; int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; // Include the original chain at the beginning of the list. When this is // used by target LowerCall hooks, this helps legalize find the // CALLSEQ_BEGIN node. 
ArgChains.push_back(Chain); // Add a chain value for each stack argument corresponding for (SDNode *U : DAG.getEntryNode().getNode()->uses()) if (LoadSDNode *L = dyn_cast(U)) if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) if (FI->getIndex() < 0) { int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); int64_t InLastByte = InFirstByte; InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || (FirstByte <= InFirstByte && InFirstByte <= LastByte)) ArgChains.push_back(SDValue(L, 1)); } // Build a tokenfactor for all the chains. return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); } bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const { return (CallCC == CallingConv::Fast && TailCallOpt) || CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail; } // Check if the value is zero-extended from i1 to i8 static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) { unsigned SizeInBits = Arg.getValueType().getSizeInBits(); if (SizeInBits < 8) return false; APInt RequredZero(SizeInBits, 0xFE); KnownBits Bits = DAG.computeKnownBits(Arg, 4); bool ZExtBool = (Bits.Zero & RequredZero) == RequredZero; return ZExtBool; } SDValue AArch64TargetLowering::changeStreamingMode( SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, SDValue PStateSM, bool Entry) const { const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()); SDValue MSROp = DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32); SDValue ExpectedSMVal = DAG.getTargetConstant(Entry ? Enable : !Enable, DL, MVT::i64); SmallVector Ops = {Chain, MSROp, PStateSM, ExpectedSMVal, RegMask}; if (InGlue) Ops.push_back(InGlue); unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP; return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); } /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, /// and add input and output parameter nodes. SDValue AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &DL = CLI.DL; SmallVector &Outs = CLI.Outs; SmallVector &OutVals = CLI.OutVals; SmallVector &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &IsTailCall = CLI.IsTailCall; CallingConv::ID &CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); MachineFunction::CallSiteInfo CSInfo; bool IsThisReturn = false; AArch64FunctionInfo *FuncInfo = MF.getInfo(); bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType; bool IsSibCall = false; bool GuardWithBTI = false; if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) && !Subtarget->noBTIAtReturnTwice()) { GuardWithBTI = FuncInfo->branchTargetEnforcement(); } // Analyze operands of the call, assigning locations to each operand. 
SmallVector ArgLocs; CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); if (IsVarArg) { unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector()) report_fatal_error("Passing SVE types to variadic functions is " "currently not supported"); } } analyzeCallOperands(*this, Subtarget, CLI, CCInfo); CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); // Assign locations to each value returned by this call. SmallVector RVLocs; CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); RetCCInfo.AnalyzeCallResult(Ins, RetCC); // Check callee args/returns for SVE registers and set calling convention // accordingly. if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) { auto HasSVERegLoc = [](CCValAssign &Loc) { if (!Loc.isRegLoc()) return false; return AArch64::ZPRRegClass.contains(Loc.getLocReg()) || AArch64::PPRRegClass.contains(Loc.getLocReg()); }; if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc)) CallConv = CallingConv::AArch64_SVE_VectorCall; } if (IsTailCall) { // Check if it's really possible to do a tail call. IsTailCall = isEligibleForTailCallOptimization(CLI); // A sibling call is one where we're under the usual C ABI and not planning // to change that but can still do a tail call: if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail) IsSibCall = true; if (IsTailCall) ++NumTailCalls; } if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getStackSize(); if (IsSibCall) { // Since we're not changing the ABI to make this a tail call, the memory // operands are already available in the caller's incoming argument space. NumBytes = 0; } // FPDiff is the byte offset of the call's argument area from the callee's. // Stores to callee stack arguments will be placed in FixedStackSlots offset // by this amount for a tail call. In a sibling call it must be 0 because the // caller will deallocate the entire stack and the callee still expects its // arguments to begin at SP+0. Completely unused for non-tail calls. int FPDiff = 0; if (IsTailCall && !IsSibCall) { unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); // Since callee will pop argument stack as a tail call, we must keep the // popped size 16-byte aligned. NumBytes = alignTo(NumBytes, 16); // FPDiff will be negative if this tail call requires more space than we // would automatically have in our incoming argument space. Positive if we // can actually shrink the stack. FPDiff = NumReusableBytes - NumBytes; // Update the required reserved area if this is the tail call requiring the // most argument stack space. if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff) FuncInfo->setTailCallReservedStack(-FPDiff); // The stack pointer must be 16-byte aligned at all times it's used for a // memory operation, which in practice means at *all* times and in // particular across call boundaries. Therefore our own arguments started at // a 16-byte aligned SP and the delta applied for the tail call should // satisfy the same constraint. assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); } // Determine whether we need any streaming mode changes. 
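  // As a sketch of those rules: a non-streaming caller invoking a streaming
  // callee must bracket the call with SMSTART SM / SMSTOP SM, a streaming
  // caller invoking a non-streaming callee needs the inverse pair, and calls
  // between matching (or streaming-compatible) interfaces need no change.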
SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction()); if (CLI.CB) CalleeAttrs = SMEAttrs(*CLI.CB); else if (std::optional Attrs = getCalleeAttrsFromExternalFunction(CLI.Callee)) CalleeAttrs = *Attrs; bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs); MachineFrameInfo &MFI = MF.getFrameInfo(); if (RequiresLazySave) { // Set up a lazy save mechanism by storing the runtime live slices // (worst-case N*N) to the TPIDR2 stack object. SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, DAG.getConstant(1, DL, MVT::i32)); SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N); unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj(); MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); SDValue BufferPtrAddr = DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType())); Chain = DAG.getTruncStore(Chain, DL, NN, BufferPtrAddr, MPI, MVT::i16); Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), TPIDR2ObjAddr); } SDValue PStateSM; std::optional RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs); if (RequiresSMChange) PStateSM = getPStateSM(DAG, Chain, CallerAttrs, DL, MVT::i64); // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL); SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy(DAG.getDataLayout())); SmallVector, 8> RegsToPass; SmallSet RegsUsed; SmallVector MemOpChains; auto PtrVT = getPointerTy(DAG.getDataLayout()); if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) { const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); RegsToPass.emplace_back(F.PReg, Val); } } // Walk the register/memloc assignments, inserting copies/loads. unsigned ExtraArgLocs = 0; for (unsigned i = 0, e = Outs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::AExt: if (Outs[i].ArgVT == MVT::i1) { // AAPCS requires i1 to be zero-extended to 8-bits by the caller. // // Check if we actually have to do this, because the value may // already be zero-extended. // // We cannot just emit a (zext i8 (trunc (assert-zext i8))) // and rely on DAGCombiner to fold this, because the following // (anyext i32) is combined with (zext i8) in DAG.getNode: // // (ext (zext x)) -> (zext x) // // This will give us (zext i32), which we cannot remove, so // try to check this beforehand. 
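  // For instance, an i1 that comes from a SETCC already has all bits above
  // bit 0 known to be zero, so checkZExtBool lets us skip the extra
  // trunc/zext pair entirely.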
if (!checkZExtBool(Arg, DAG)) { Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); } } Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::AExtUpper: assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, DAG.getConstant(32, DL, VA.getLocVT())); break; case CCValAssign::BCvt: Arg = DAG.getBitcast(VA.getLocVT(), Arg); break; case CCValAssign::Trunc: Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); break; case CCValAssign::FPExt: Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::Indirect: bool isScalable = VA.getValVT().isScalableVT(); assert((isScalable || Subtarget->isWindowsArm64EC()) && "Indirect arguments should be scalable on most subtargets"); uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue(); uint64_t PartSize = StoreSize; unsigned NumParts = 1; if (Outs[i].Flags.isInConsecutiveRegs()) { assert(!Outs[i].Flags.isInConsecutiveRegsLast()); while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) ++NumParts; StoreSize *= NumParts; } Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext()); Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); int FI = MFI.CreateStackObject(StoreSize, Alignment, false); if (isScalable) MFI.setStackID(FI, TargetStackID::ScalableVector); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); SDValue Ptr = DAG.getFrameIndex( FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); SDValue SpillSlot = Ptr; // Ensure we generate all stores for each tuple part, whilst updating the // pointer after each store correctly using vscale. while (NumParts) { Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI); NumParts--; if (NumParts > 0) { SDValue BytesIncrement; if (isScalable) { BytesIncrement = DAG.getVScale( DL, Ptr.getValueType(), APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize)); } else { BytesIncrement = DAG.getConstant( APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, Ptr.getValueType()); } SDNodeFlags Flags; Flags.setNoUnsignedWrap(true); MPI = MachinePointerInfo(MPI.getAddrSpace()); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, BytesIncrement, Flags); ExtraArgLocs++; i++; } } Arg = SpillSlot; break; } if (VA.isRegLoc()) { if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && Outs[0].VT == MVT::i64) { assert(VA.getLocVT() == MVT::i64 && "unexpected calling convention register assignment"); assert(!Ins.empty() && Ins[0].VT == MVT::i64 && "unexpected use of 'returned'"); IsThisReturn = true; } if (RegsUsed.count(VA.getLocReg())) { // If this register has already been used then we're trying to pack // parts of an [N x i32] into an X-register. The extension type will // take care of putting the two halves in the right place but we have to // combine them. SDValue &Bits = llvm::find_if(RegsToPass, [=](const std::pair &Elt) { return Elt.first == VA.getLocReg(); }) ->second; Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); // Call site info is used for function's parameter entry value // tracking. For now we track only simple cases when parameter // is transferred through whole register. 
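  // So if two i32 halves have been packed into a single X register, the
  // parameter no longer maps to a whole register of its own and its entry
  // is dropped from CSInfo below.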
llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) { return ArgReg.Reg == VA.getLocReg(); }); } else { // Add an extra level of indirection for streaming mode changes by // using a pseudo copy node that cannot be rematerialised between a // smstart/smstop and the call by the simple register coalescer. if (RequiresSMChange && isa(Arg)) Arg = DAG.getNode(AArch64ISD::OBSCURE_COPY, DL, MVT::i64, Arg); RegsToPass.emplace_back(VA.getLocReg(), Arg); RegsUsed.insert(VA.getLocReg()); const TargetOptions &Options = DAG.getTarget().Options; if (Options.EmitCallSiteInfo) CSInfo.emplace_back(VA.getLocReg(), i); } } else { assert(VA.isMemLoc()); SDValue DstAddr; MachinePointerInfo DstInfo; // FIXME: This works on big-endian for composite byvals, which are the // common case. It should also work for fundamental types too. uint32_t BEAlign = 0; unsigned OpSize; if (VA.getLocInfo() == CCValAssign::Indirect || VA.getLocInfo() == CCValAssign::Trunc) OpSize = VA.getLocVT().getFixedSizeInBits(); else OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 : VA.getValVT().getSizeInBits(); OpSize = (OpSize + 7) / 8; if (!Subtarget->isLittleEndian() && !Flags.isByVal() && !Flags.isInConsecutiveRegs()) { if (OpSize < 8) BEAlign = 8 - OpSize; } unsigned LocMemOffset = VA.getLocMemOffset(); int32_t Offset = LocMemOffset + BEAlign; SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); if (IsTailCall) { Offset = Offset + FPDiff; int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); DstAddr = DAG.getFrameIndex(FI, PtrVT); DstInfo = MachinePointerInfo::getFixedStack(MF, FI); // Make sure any stack arguments overlapping with where we're storing // are loaded before this eventual operation. Otherwise they'll be // clobbered. Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); } else { SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); } if (Outs[i].Flags.isByVal()) { SDValue SizeNode = DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); SDValue Cpy = DAG.getMemcpy( Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getNonZeroByValAlign(), /*isVol = */ false, /*AlwaysInline = */ false, /*isTailCall = */ false, DstInfo, MachinePointerInfo()); MemOpChains.push_back(Cpy); } else { // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already // promoted to a legal register type i32, we should truncate Arg back to // i1/i8/i16. if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || VA.getValVT() == MVT::i16) Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); MemOpChains.push_back(Store); } } } if (IsVarArg && Subtarget->isWindowsArm64EC()) { // For vararg calls, the Arm64EC ABI requires values in x4 and x5 // describing the argument list. x4 contains the address of the // first stack parameter. x5 contains the size in bytes of all parameters // passed on the stack. 
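  // For example, a variadic call passing six 8-byte arguments puts the first
  // four in x0-x3 and the last two on the stack, so x4 holds the address of
  // that 16-byte stack area (SP at the call) and x5 holds 16.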
RegsToPass.emplace_back(AArch64::X4, StackPtr); RegsToPass.emplace_back(AArch64::X5, DAG.getConstant(NumBytes, DL, MVT::i64)); } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); SDValue InGlue; if (RequiresSMChange) { SDValue NewChain = changeStreamingMode(DAG, DL, *RequiresSMChange, Chain, InGlue, PStateSM, true); Chain = NewChain.getValue(0); InGlue = NewChain.getValue(1); } // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. for (auto &RegToPass : RegsToPass) { Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, RegToPass.second, InGlue); InGlue = Chain.getValue(1); } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. if (auto *G = dyn_cast(Callee)) { auto GV = G->getGlobal(); unsigned OpFlags = Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()); if (OpFlags & AArch64II::MO_GOT) { Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } else { const GlobalValue *GV = G->getGlobal(); Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); } } else if (auto *S = dyn_cast(Callee)) { if (getTargetMachine().getCodeModel() == CodeModel::Large && Subtarget->isTargetMachO()) { const char *Sym = S->getSymbol(); Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } else { const char *Sym = S->getSymbol(); Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); } } // We don't usually want to end the call-sequence here because we would tidy // the frame up *after* the call, however in the ABI-changing tail-call case // we've carefully laid out the parameters so that when sp is reset they'll be // in the correct location. if (IsTailCall && !IsSibCall) { Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL); InGlue = Chain.getValue(1); } std::vector Ops; Ops.push_back(Chain); Ops.push_back(Callee); if (IsTailCall) { // Each tail call may have to adjust the stack by a different amount, so // this information must travel along with the operation for eventual // consumption by emitEpilogue. Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); } // Add argument registers to the end of the list so that they are known live // into the call. for (auto &RegToPass : RegsToPass) Ops.push_back(DAG.getRegister(RegToPass.first, RegToPass.second.getValueType())); // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); if (IsThisReturn) { // For 'this' returns, use the X0-preserving mask if applicable Mask = TRI->getThisReturnPreservedMask(MF, CallConv); if (!Mask) { IsThisReturn = false; Mask = TRI->getCallPreservedMask(MF, CallConv); } } else Mask = TRI->getCallPreservedMask(MF, CallConv); if (Subtarget->hasCustomCallingConv()) TRI->UpdateCustomCallPreservedMask(MF, &Mask); if (TRI->isAnyArgRegReserved(MF)) TRI->emitReservedArgRegCallError(MF); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); if (InGlue.getNode()) Ops.push_back(InGlue); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); // If we're doing a tall call, use a TC_RETURN here rather than an // actual call instruction. 
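  // TC_RETURN ends the function with a branch to the callee; the FPDiff
  // operand added above lets emitEpilogue adjust SP first when the callee's
  // argument area has a different size from ours.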
if (IsTailCall) { MF.getFrameInfo().setHasTailCall(); SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); if (IsCFICall) Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue()); DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); return Ret; } unsigned CallOpc = AArch64ISD::CALL; // Calls with operand bundle "clang.arc.attachedcall" are special. They should // be expanded to the call, directly followed by a special marker sequence and // a call to an ObjC library function. Use CALL_RVMARKER to do that. if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) { assert(!IsTailCall && "tail calls cannot be marked with clang.arc.attachedcall"); CallOpc = AArch64ISD::CALL_RVMARKER; // Add a target global address for the retainRV/claimRV runtime function // just before the call target. Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB); auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT); Ops.insert(Ops.begin() + 1, GA); } else if (GuardWithBTI) CallOpc = AArch64ISD::CALL_BTI; // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops); if (IsCFICall) Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue()); DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); InGlue = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL); InGlue = Chain.getValue(1); // Handle result values, copying them out of physregs into vregs that we // return. SDValue Result = LowerCallResult(Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn, IsThisReturn ? OutVals[0] : SDValue()); if (!Ins.empty()) InGlue = Result.getValue(Result->getNumValues() - 1); if (RequiresSMChange) { assert(PStateSM && "Expected a PStateSM to be set"); Result = changeStreamingMode(DAG, DL, !*RequiresSMChange, Result, InGlue, PStateSM, false); } if (RequiresLazySave) { // Unconditionally resume ZA. Result = DAG.getNode( AArch64ISD::SMSTART, DL, MVT::Other, Result, DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); // Conditionally restore the lazy save using a pseudo node. unsigned FI = FuncInfo->getLazySaveTPIDR2Obj(); SDValue RegMask = DAG.getRegisterMask( TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); SDValue RestoreRoutine = DAG.getTargetExternalSymbol( "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout())); SDValue TPIDR2_EL0 = DAG.getNode( ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); // Copy the address of the TPIDR2 block into X0 before 'calling' the // RESTORE_ZA pseudo. SDValue Glue; SDValue TPIDR2Block = DAG.getFrameIndex( FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64), RestoreRoutine, RegMask, Result.getValue(1)}); // Finally reset the TPIDR2_EL0 register to 0. 
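  // (Per the lazy-save protocol, a callee that actually used ZA has already
  // saved it into our buffer and cleared TPIDR2_EL0, which is the condition
  // the RESTORE_ZA pseudo above keys off; clearing the register here simply
  // ends our lazy-save region in the remaining case.)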
Result = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Result, DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i64)); } if (RequiresSMChange || RequiresLazySave) { for (unsigned I = 0; I < InVals.size(); ++I) { // The smstart/smstop is chained as part of the call, but when the // resulting chain is discarded (which happens when the call is not part // of a chain, e.g. a call to @llvm.cos()), we need to ensure the // smstart/smstop is chained to the result value. We can do that by doing // a vreg -> vreg copy. Register Reg = MF.getRegInfo().createVirtualRegister( getRegClassFor(InVals[I].getValueType().getSimpleVT())); SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]); InVals[I] = DAG.getCopyFromReg(X, DL, Reg, InVals[I].getValueType()); } } return Result; } bool AArch64TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC); } SDValue AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { auto &MF = DAG.getMachineFunction(); auto *FuncInfo = MF.getInfo(); CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC); // Copy the result values into the output registers. SDValue Glue; SmallVector, 4> RetVals; SmallSet RegsUsed; for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue Arg = OutVals[realRVLocIdx]; switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: if (Outs[i].ArgVT == MVT::i1) { // AAPCS requires i1 to be zero-extended to i8 by the producer of the // value. This is strictly redundant on Darwin (which uses "zeroext // i1"), but will be optimised out before ISel. 
Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); } break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); break; case CCValAssign::AExt: case CCValAssign::ZExt: Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); break; case CCValAssign::AExtUpper: assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, DAG.getConstant(32, DL, VA.getLocVT())); break; } if (RegsUsed.count(VA.getLocReg())) { SDValue &Bits = llvm::find_if(RetVals, [=](const std::pair &Elt) { return Elt.first == VA.getLocReg(); })->second; Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); } else { RetVals.emplace_back(VA.getLocReg(), Arg); RegsUsed.insert(VA.getLocReg()); } } const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); // Emit SMSTOP before returning from a locally streaming function SMEAttrs FuncAttrs(MF.getFunction()); if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) { Chain = DAG.getNode( AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32), DAG.getConstant(1, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64), DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask())); Glue = Chain.getValue(1); } SmallVector RetOps(1, Chain); for (auto &RetVal : RetVals) { Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue); Glue = Chain.getValue(1); RetOps.push_back( DAG.getRegister(RetVal.first, RetVal.second.getValueType())); } // Windows AArch64 ABIs require that for returning structs by value we copy // the sret argument into X0 for the return. // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into X0. if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg, getPointerTy(MF.getDataLayout())); unsigned RetValReg = AArch64::X0; Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue); Glue = Chain.getValue(1); RetOps.push_back( DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); } const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF); if (I) { for (; *I; ++I) { if (AArch64::GPR64RegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i64)); else if (AArch64::FPR64RegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); } } RetOps[0] = Chain; // Update chain. // Add the glue if we have it. 
if (Glue.getNode()) RetOps.push_back(Glue); return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps); } //===----------------------------------------------------------------------===// // Other Lowering Code //===----------------------------------------------------------------------===// SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const { return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, N->getOffset(), Flag); } SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const { return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag); } SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const { return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(), N->getOffset(), Flag); } SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const { return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag); } // (loadGOT sym) template SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG, unsigned Flags) const { LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n"); SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags); // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes instead of using a wrapper node. return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); } // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)) template SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG, unsigned Flags) const { LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n"); SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( AArch64ISD::WrapperLarge, DL, Ty, getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags), getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags), getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags), getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags)); } // (addlow (adrp %hi(sym)) %lo(sym)) template SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, unsigned Flags) const { LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n"); SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags); SDValue Lo = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi); return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo); } // (adr sym) template SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG, unsigned Flags) const { LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n"); SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); SDValue Sym = getTargetNode(N, Ty, DAG, Flags); return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym); } SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GN = cast(Op); const GlobalValue *GV = GN->getGlobal(); unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); if (OpFlags != AArch64II::MO_NO_FLAG) assert(cast(Op)->getOffset() == 0 && "unexpected offset in global node"); // This also catches the large code model case for Darwin, and tiny code // model with got relocations. 
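  // For the common small-code-model case this boils down to, roughly:
  //   adrp x0, sym
  //   add  x0, x0, :lo12:sym
  // whereas the GOT path loads the address instead:
  //   adrp x0, :got:sym
  //   ldr  x0, [x0, :got_lo12:sym]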
if ((OpFlags & AArch64II::MO_GOT) != 0) { return getGOT(GN, DAG, OpFlags); } SDValue Result; if (getTargetMachine().getCodeModel() == CodeModel::Large) { Result = getAddrLarge(GN, DAG, OpFlags); } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { Result = getAddrTiny(GN, DAG, OpFlags); } else { Result = getAddr(GN, DAG, OpFlags); } EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(GN); if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_DLLIMPORTAUX | AArch64II::MO_COFFSTUB)) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); return Result; } /// Convert a TLS address reference into the correct sequence of loads /// and calls to compute the variable's address (for Darwin, currently) and /// return an SDValue containing the final node. /// Darwin only has one TLS scheme which must be capable of dealing with the /// fully general situation, in the worst case. This means: /// + "extern __thread" declaration. /// + Defined in a possibly unknown dynamic library. /// /// The general system is that each __thread variable has a [3 x i64] descriptor /// which contains information used by the runtime to calculate the address. The /// only part of this the compiler needs to know about is the first xword, which /// contains a function pointer that must be called with the address of the /// entire descriptor in "x0". /// /// Since this descriptor may be in a different unit, in general even the /// descriptor must be accessed via an indirect load. The "ideal" code sequence /// is: /// adrp x0, _var@TLVPPAGE /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, /// ; the function pointer /// blr x1 ; Uses descriptor address in x0 /// ; Address of _var is now in x0. /// /// If the address of _var's descriptor *is* known to the linker, then it can /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for /// a slight efficiency gain. SDValue AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin() && "This function expects a Darwin target"); SDLoc DL(Op); MVT PtrVT = getPointerTy(DAG.getDataLayout()); MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout()); const GlobalValue *GV = cast(Op)->getGlobal(); SDValue TLVPAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); // The first entry in the descriptor is a function pointer that we must call // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad( PtrMemVT, DL, Chain, DescAddr, MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(PtrMemVT.getSizeInBits() / 8), MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); Chain = FuncTLVGet.getValue(1); // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer. FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT); MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setAdjustsStack(true); // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). 
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); const uint32_t *Mask = TRI->getTLSCallPreservedMask(); if (Subtarget->hasCustomCallingConv()) TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); // Finally, we can make the call. This is just a degenerate version of a // normal AArch64 call node: x0 takes the address of the descriptor, and // returns the address of the variable in this thread. Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); Chain = DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), DAG.getRegisterMask(Mask), Chain.getValue(1)); return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); } /// Convert a thread-local variable reference into a sequence of instructions to /// compute the variable's address for the local exec TLS model of ELF targets. /// The sequence depends on the maximum TLS area size. SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV, SDValue ThreadBase, const SDLoc &DL, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue TPOff, Addr; switch (DAG.getTarget().Options.TLSSize) { default: llvm_unreachable("Unexpected TLS size"); case 12: { // mrs x0, TPIDR_EL0 // add x0, x0, :tprel_lo12:a SDValue Var = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF); return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, Var, DAG.getTargetConstant(0, DL, MVT::i32)), 0); } case 24: { // mrs x0, TPIDR_EL0 // add x0, x0, :tprel_hi12:a // add x0, x0, :tprel_lo12_nc:a SDValue HiVar = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); SDValue LoVar = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, HiVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr, LoVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); } case 32: { // mrs x1, TPIDR_EL0 // movz x0, #:tprel_g1:a // movk x0, #:tprel_g0_nc:a // add x0, x1, x0 SDValue HiVar = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1); SDValue LoVar = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, DAG.getTargetConstant(16, DL, MVT::i32)), 0); TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); } case 48: { // mrs x1, TPIDR_EL0 // movz x0, #:tprel_g2:a // movk x0, #:tprel_g1_nc:a // movk x0, #:tprel_g0_nc:a // add x0, x1, x0 SDValue HiVar = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2); SDValue MiVar = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC); SDValue LoVar = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, DAG.getTargetConstant(32, DL, MVT::i32)), 0); TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar, DAG.getTargetConstant(16, DL, MVT::i32)), 0); TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, 
DAG.getTargetConstant(0, DL, MVT::i32)), 0); return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); } } } /// When accessing thread-local variables under either the general-dynamic or /// local-dynamic system, we make a "TLS-descriptor" call. The variable will /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry /// is a function pointer to carry out the resolution. /// /// The sequence is: /// adrp x0, :tlsdesc:var /// ldr x1, [x0, #:tlsdesc_lo12:var] /// add x0, x0, #:tlsdesc_lo12:var /// .tlsdesccall var /// blr x1 /// (TPIDR_EL0 offset now in x0) /// /// The above sequence must be produced unscheduled, to enable the linker to /// optimize/relax this sequence. /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the /// above sequence, and expanded really late in the compilation flow, to ensure /// the sequence is produced as per above. SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr}); SDValue Glue = Chain.getValue(1); return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); } SDValue AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetELF() && "This function expects an ELF target"); const GlobalAddressSDNode *GA = cast(Op); TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); if (!EnableAArch64ELFLocalDynamicTLSGeneration) { if (Model == TLSModel::LocalDynamic) Model = TLSModel::GeneralDynamic; } if (getTargetMachine().getCodeModel() == CodeModel::Large && Model != TLSModel::LocalExec) report_fatal_error("ELF TLS only supported in small memory model or " "in local exec TLS model"); // Different choices can be made for the maximum size of the TLS area for a // module. For the small address model, the default TLS size is 16MiB and the // maximum TLS size is 4GiB. // FIXME: add tiny and large code model support for TLS access models other // than local exec. We currently generate the same code as small for tiny, // which may be larger than needed. SDValue TPOff; EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); const GlobalValue *GV = GA->getGlobal(); SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); if (Model == TLSModel::LocalExec) { return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG); } else if (Model == TLSModel::InitialExec) { TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); } else if (Model == TLSModel::LocalDynamic) { // Local-dynamic accesses proceed in two phases. A general-dynamic TLS // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate // the beginning of the module's TLS region, followed by a DTPREL offset // calculation. // These accesses will need deduplicating if there's more than one. AArch64FunctionInfo *MFI = DAG.getMachineFunction().getInfo(); MFI->incNumLocalDynamicTLSAccesses(); // The call needs a relocation too for linker relaxation. It doesn't make // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of // the address. 
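    // Schematically the local-dynamic sequence becomes:
    //   (TLS descriptor call for _TLS_MODULE_BASE_)
    //       ; x0 = offset of this module's TLS block from TPIDR_EL0
    //   add x0, x0, :dtprel_hi12:var
    //   add x0, x0, :dtprel_lo12_nc:var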
SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS); // Now we can calculate the offset from TPIDR_EL0 to this module's // thread-local area. TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); // Now use :dtprel_whatever: operations to calculate this variable's offset // in its thread-storage area. SDValue HiVar = DAG.getTargetGlobalAddress( GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); SDValue LoVar = DAG.getTargetGlobalAddress( GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); } else if (Model == TLSModel::GeneralDynamic) { // The call needs a relocation too for linker relaxation. It doesn't make // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of // the address. SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); // Finally we can make a call to calculate the offset from tpidr_el0. TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); } else llvm_unreachable("Unsupported ELF TLS access model"); return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); } SDValue AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); SDValue Chain = DAG.getEntryNode(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64); // Load the ThreadLocalStoragePointer from the TEB // A pointer to the TLS array is located at offset 0x58 from the TEB. SDValue TLSArray = DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL)); TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); Chain = TLSArray.getValue(1); // Load the TLS index from the C runtime; // This does the same as getAddr(), but without having a GlobalAddressSDNode. // This also does the same as LOADgot, but using a generic i32 load, // while LOADgot only loads i64. SDValue TLSIndexHi = DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE); SDValue TLSIndexLo = DAG.getTargetExternalSymbol( "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi); SDValue TLSIndex = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo); TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo()); Chain = TLSIndex.getValue(1); // The pointer to the thread's TLS data area is at the TLS Index scaled by 8 // offset into the TLSArray. TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex); SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, DAG.getConstant(3, DL, PtrVT)); SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), MachinePointerInfo()); Chain = TLS.getValue(1); const GlobalAddressSDNode *GA = cast(Op); const GlobalValue *GV = GA->getGlobal(); SDValue TGAHi = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); SDValue TGALo = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); // Add the offset from the start of the .tls section (section base). 
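  // Schematically, the whole Windows sequence is roughly:
  //   ldr x8, [x18, #0x58]      ; ThreadLocalStoragePointer from the TEB
  //   (adrp + ldr of _tls_index) ; this module's TLS index
  //   ldr x8, [x8, x9, lsl #3]  ; this module's TLS block
  //   add x8, x8, <hi12/lo12 of var's offset within the .tls section>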
SDValue Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi, DAG.getTargetConstant(0, DL, MVT::i32)), 0); Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo); return Addr; } SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalAddressSDNode *GA = cast(Op); if (DAG.getTarget().useEmulatedTLS()) return LowerToTLSEmulatedModel(GA, DAG); if (Subtarget->isTargetDarwin()) return LowerDarwinGlobalTLSAddress(Op, DAG); if (Subtarget->isTargetELF()) return LowerELFGlobalTLSAddress(Op, DAG); if (Subtarget->isTargetWindows()) return LowerWindowsGlobalTLSAddress(Op, DAG); llvm_unreachable("Unexpected platform trying to use TLS"); } // Looks through \param Val to determine the bit that can be used to // check the sign of the value. It returns the unextended value and // the sign bit position. std::pair lookThroughSignExtension(SDValue Val) { if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG) return {Val.getOperand(0), cast(Val.getOperand(1))->getVT().getFixedSizeInBits() - 1}; if (Val.getOpcode() == ISD::SIGN_EXTEND) return {Val.getOperand(0), Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1}; return {Val, Val.getValueSizeInBits() - 1}; } SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); SDLoc dl(Op); MachineFunction &MF = DAG.getMachineFunction(); // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions // will not be produced, as they are conditional branch instructions that do // not set flags. bool ProduceNonFlagSettingCondBr = !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); // Handle f128 first, since lowering it will result in comparing the return // value of a libcall against zero, which is just what the rest of LowerBR_CC // is expecting to deal with. if (LHS.getValueType() == MVT::f128) { softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!RHS.getNode()) { RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) return SDValue(); // The actual operation with overflow check. AArch64CC::CondCode OFCC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); if (CC == ISD::SETNE) OFCC = getInvertedCondCode(OFCC); SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32); return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Overflow); } if (LHS.getValueType().isInteger()) { assert((LHS.getValueType() == RHS.getValueType()) && (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); // If the RHS of the comparison is zero, we can potentially fold this // to a specialized branch. const ConstantSDNode *RHSC = dyn_cast(RHS); if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) { if (CC == ISD::SETEQ) { // See if we can use a TBZ to fold in an AND as well. // TBZ has a smaller branch displacement than CBZ. 
If the offset is // out of bounds, a late MI-layer pass rewrites branches. // 403.gcc is an example that hits this case. if (LHS.getOpcode() == ISD::AND && isa(LHS.getOperand(1)) && isPowerOf2_64(LHS.getConstantOperandVal(1))) { SDValue Test = LHS.getOperand(0); uint64_t Mask = LHS.getConstantOperandVal(1); return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, DAG.getConstant(Log2_64(Mask), dl, MVT::i64), Dest); } return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); } else if (CC == ISD::SETNE) { // See if we can use a TBZ to fold in an AND as well. // TBZ has a smaller branch displacement than CBZ. If the offset is // out of bounds, a late MI-layer pass rewrites branches. // 403.gcc is an example that hits this case. if (LHS.getOpcode() == ISD::AND && isa(LHS.getOperand(1)) && isPowerOf2_64(LHS.getConstantOperandVal(1))) { SDValue Test = LHS.getOperand(0); uint64_t Mask = LHS.getConstantOperandVal(1); return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, DAG.getConstant(Log2_64(Mask), dl, MVT::i64), Dest); } return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { // Don't combine AND since emitComparison converts the AND to an ANDS // (a.k.a. TST) and the test in the test bit and branch instruction // becomes redundant. This would also increase register pressure. uint64_t SignBitPos; std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, DAG.getConstant(SignBitPos, dl, MVT::i64), Dest); } } if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) { // Don't combine AND since emitComparison converts the AND to an ANDS // (a.k.a. TST) and the test in the test bit and branch instruction // becomes redundant. This would also increase register pressure. uint64_t SignBitPos; std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, DAG.getConstant(SignBitPos, dl, MVT::i64), Dest); } SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp); } assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two branches to implement. 
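// For example (illustrative): an ordered-not-equal (SETONE) branch maps to
// the condition pair {MI, GT} and an unordered-equal (SETUEQ) branch to
// {EQ, VS}, so when CC2 != AL a second conditional branch below reuses the
// same FCMP flags, e.g. "fcmp d0, d1; b.mi .Ldest; b.gt .Ldest".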
SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); SDValue BR1 = DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); if (CC2 != AArch64CC::AL) { SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, Cmp); } return BR1; } SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget->hasNEON()) return SDValue(); EVT VT = Op.getValueType(); EVT IntVT = VT.changeTypeToInteger(); SDLoc DL(Op); SDValue In1 = Op.getOperand(0); SDValue In2 = Op.getOperand(1); EVT SrcVT = In2.getValueType(); if (!SrcVT.bitsEq(VT)) In2 = DAG.getFPExtendOrRound(In2, DL, VT); if (VT.isScalableVector()) IntVT = getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger()); if (VT.isFixedLengthVector() && useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) { EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); In1 = convertToScalableVector(DAG, ContainerVT, In1); In2 = convertToScalableVector(DAG, ContainerVT, In2); SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2); return convertFromScalableVector(DAG, VT, Res); } auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) { if (VT.isScalableVector()) return getSVESafeBitCast(VT, Op, DAG); return DAG.getBitcast(VT, Op); }; SDValue VecVal1, VecVal2; EVT VecVT; auto SetVecVal = [&](int Idx = -1) { if (!VT.isVector()) { VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1); VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2); } else { VecVal1 = BitCast(VecVT, In1, DAG); VecVal2 = BitCast(VecVT, In2, DAG); } }; if (VT.isVector()) { VecVT = IntVT; SetVecVal(); } else if (VT == MVT::f64) { VecVT = MVT::v2i64; SetVecVal(AArch64::dsub); } else if (VT == MVT::f32) { VecVT = MVT::v4i32; SetVecVal(AArch64::ssub); } else if (VT == MVT::f16) { VecVT = MVT::v8i16; SetVecVal(AArch64::hsub); } else { llvm_unreachable("Invalid type for copysign!"); } unsigned BitWidth = In1.getScalarValueSizeInBits(); SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT); // We want to materialize a mask with every bit but the high bit set, but the // AdvSIMD immediate moves cannot materialize that in a single instruction for // 64-bit elements. Instead, materialize all bits set and then negate that. 
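// Worked example for the 64-bit case (illustrative): the per-lane mask we
// want is 0x7fffffffffffffff, which MOVI cannot encode directly. Building
// all-ones (0xffffffffffffffff), bitcasting to f64 and applying FNEG flips
// only bit 63, yielding 0x7fffffffffffffff in each lane, which is then
// bitcast back to v2i64 for the BSP.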
if (VT == MVT::f64 || VT == MVT::v2f64) { SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT); SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV); SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV); SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV); } SDValue BSP = DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2); if (VT == MVT::f16) return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP); if (VT == MVT::f32) return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP); if (VT == MVT::f64) return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP); return BitCast(VT, BSP, DAG); } SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op, SelectionDAG &DAG) const { if (DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat)) return SDValue(); if (!Subtarget->hasNEON()) return SDValue(); bool IsParity = Op.getOpcode() == ISD::PARITY; SDValue Val = Op.getOperand(0); SDLoc DL(Op); EVT VT = Op.getValueType(); // for i32, general parity function using EORs is more efficient compared to // using floating point if (VT == MVT::i32 && IsParity) return SDValue(); // If there is no CNT instruction available, GPR popcount can // be more efficiently lowered to the following sequence that uses // AdvSIMD registers/instructions as long as the copies to/from // the AdvSIMD registers are cheap. // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd // CNT V0.8B, V0.8B // 8xbyte pop-counts // ADDV B0, V0.8B // sum 8xbyte pop-counts // UMOV X0, V0.B[0] // copy byte result back to integer reg if (VT == MVT::i32 || VT == MVT::i64) { if (VT == MVT::i32) Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); SDValue UaddLV = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); if (IsParity) UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV, DAG.getConstant(1, DL, MVT::i32)); if (VT == MVT::i64) UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); return UaddLV; } else if (VT == MVT::i128) { Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val); SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val); SDValue UaddLV = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); if (IsParity) UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV, DAG.getConstant(1, DL, MVT::i32)); return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV); } assert(!IsParity && "ISD::PARITY of vector types not supported"); if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU); assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && "Unexpected type for custom ctpop lowering"); EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; Val = DAG.getBitcast(VT8Bit, Val); Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val); // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. unsigned EltSize = 8; unsigned NumElts = VT.is64BitVector() ? 
8 : 16; while (EltSize != VT.getScalarSizeInBits()) { EltSize *= 2; NumElts /= 2; MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); Val = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val); } return Val; } SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); assert(VT.isScalableVector() || useSVEForFixedLengthVectorVT( VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())); SDLoc DL(Op); SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0)); return DAG.getNode(ISD::CTLZ, DL, VT, RBIT); } SDValue AArch64TargetLowering::LowerMinMax(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc DL(Op); unsigned Opcode = Op.getOpcode(); ISD::CondCode CC; switch (Opcode) { default: llvm_unreachable("Wrong instruction"); case ISD::SMAX: CC = ISD::SETGT; break; case ISD::SMIN: CC = ISD::SETLT; break; case ISD::UMAX: CC = ISD::SETUGT; break; case ISD::UMIN: CC = ISD::SETULT; break; } if (VT.isScalableVector() || useSVEForFixedLengthVectorVT( VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) { switch (Opcode) { default: llvm_unreachable("Wrong instruction"); case ISD::SMAX: return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED); case ISD::SMIN: return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED); case ISD::UMAX: return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED); case ISD::UMIN: return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED); } } SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC); return DAG.getSelect(DL, VT, Cond, Op0, Op1); } SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isScalableVector() || useSVEForFixedLengthVectorVT( VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU); SDLoc DL(Op); SDValue REVB; MVT VST; switch (VT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Invalid type for bitreverse!"); case MVT::v2i32: { VST = MVT::v8i8; REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0)); break; } case MVT::v4i32: { VST = MVT::v16i8; REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0)); break; } case MVT::v1i64: { VST = MVT::v8i8; REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0)); break; } case MVT::v2i64: { VST = MVT::v16i8; REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0)); break; } } return DAG.getNode(AArch64ISD::NVCAST, DL, VT, DAG.getNode(ISD::BITREVERSE, DL, VST, REVB)); } // Check whether the continuous comparison sequence. static bool isOrXorChain(SDValue N, unsigned &Num, SmallVector, 16> &WorkList) { if (Num == MaxXors) return false; // Skip the one-use zext if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse()) N = N->getOperand(0); // The leaf node must be XOR if (N->getOpcode() == ISD::XOR) { WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1))); Num++; return true; } // All the non-leaf nodes must be OR. if (N->getOpcode() != ISD::OR || !N->hasOneUse()) return false; if (isOrXorChain(N->getOperand(0), Num, WorkList) && isOrXorChain(N->getOperand(1), Num, WorkList)) return true; return false; } // Transform chains of ORs and XORs, which usually outlined by memcmp/bmp. 
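// For example (illustrative; operand names are placeholders):
// "((a0 ^ b0) | (a1 ^ b1)) == 0", the usual expansion of a short
// memcmp-style equality, can then be selected as
//   cmp  a0, b0
//   ccmp a1, b1, #0, eq
//   cset w0, eq
// instead of materializing each XOR result.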
static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDLoc DL(N); EVT VT = N->getValueType(0); SmallVector, 16> WorkList; // Only handle integer compares. if (N->getOpcode() != ISD::SETCC) return SDValue(); ISD::CondCode Cond = cast(N->getOperand(2))->get(); // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as: // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag unsigned NumXors = 0; if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) && LHS->getOpcode() == ISD::OR && LHS->hasOneUse() && isOrXorChain(LHS, NumXors, WorkList)) { SDValue XOR0, XOR1; std::tie(XOR0, XOR1) = WorkList[0]; unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR; SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond); for (unsigned I = 1; I < WorkList.size(); I++) { std::tie(XOR0, XOR1) = WorkList[I]; SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond); Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain); } // Exit early by inverting the condition, which help reduce indentations. return Cmp; } return SDValue(); } SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); bool IsStrict = Op->isStrictFPOpcode(); bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; unsigned OpNo = IsStrict ? 1 : 0; SDValue Chain; if (IsStrict) Chain = Op.getOperand(0); SDValue LHS = Op.getOperand(OpNo + 0); SDValue RHS = Op.getOperand(OpNo + 1); ISD::CondCode CC = cast(Op.getOperand(OpNo + 2))->get(); SDLoc dl(Op); // We chose ZeroOrOneBooleanContents, so use zero and one. EVT VT = Op.getValueType(); SDValue TVal = DAG.getConstant(1, dl, VT); SDValue FVal = DAG.getConstant(0, dl, VT); // Handle f128 first, since one possible outcome is a normal integer // comparison which gets picked up by the next if statement. if (LHS.getValueType() == MVT::f128) { softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling); // If softenSetCCOperands returned a scalar, use it. if (!RHS.getNode()) { assert(LHS.getValueType() == Op.getValueType() && "Unexpected setcc expansion!"); return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS; } } if (LHS.getValueType().isInteger()) { SDValue CCVal; SDValue Cmp = getAArch64Cmp( LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl); // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be // matched to a single CSINC instruction. SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; } // Now we know we're dealing with FP values. assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead // and do the comparison. SDValue Cmp; if (IsStrict) Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling); else Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); SDValue Res; if (CC2 == AArch64CC::AL) { changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1, CC2); SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. 
This will allow the setcc to be // matched to a single CSINC instruction. Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); } else { // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't // totally clean. Some of them require two CSELs to implement. As is in // this case, we emit the first CSEL and then emit a second using the output // of the first as the RHS. We're effectively OR'ing the two CC's together. // FIXME: It would be nice if we could match the two CSELs to two CSINCs. SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res; } SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); EVT VT = LHS.getValueType(); if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); SDLoc DL(Op); SDValue Carry = Op.getOperand(2); // SBCS uses a carry not a borrow so the carry flag should be inverted first. SDValue InvCarry = valueToCarryFlag(Carry, DAG, true); SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue), LHS, RHS, InvCarry); EVT OpVT = Op.getValueType(); SDValue TVal = DAG.getConstant(1, DL, OpVT); SDValue FVal = DAG.getConstant(0, DL, OpVT); ISD::CondCode Cond = cast(Op.getOperand(3))->get(); ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT); SDValue CCVal = DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32); // Inputs are swapped because the condition is inverted. This will allow // matching with a single CSINC instruction. return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal, Cmp.getValue(1)); } SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, const SDLoc &dl, SelectionDAG &DAG) const { // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. if (LHS.getValueType() == MVT::f128) { softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!RHS.getNode()) { RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } // Also handle f16, for which we need to do a f32 comparison. if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); } // Next, handle integers. if (LHS.getValueType().isInteger()) { assert((LHS.getValueType() == RHS.getValueType()) && (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); ConstantSDNode *CFVal = dyn_cast(FVal); ConstantSDNode *CTVal = dyn_cast(TVal); ConstantSDNode *RHSC = dyn_cast(RHS); // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform // into (OR (ASR lhs, N-1), 1), which requires less instructions for the // supported types. 
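// Concretely (illustrative, i32): "x > -1 ? 1 : -1" becomes
//   asr w8, w0, #31   // 0 when x >= 0, all-ones when x < 0
//   orr w0, w8, #1    // 1 or -1
// avoiding the cmp + csel sequence.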
if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal && CTVal->isOne() && CFVal->isAllOnes() && LHS.getValueType() == TVal.getValueType()) { EVT VT = LHS.getValueType(); SDValue Shift = DAG.getNode(ISD::SRA, dl, VT, LHS, DAG.getConstant(VT.getSizeInBits() - 1, dl, VT)); return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT)); } // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns. // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1)) // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1)) // Both require less instructions than compare and conditional select. if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal && RHSC && RHSC->isZero() && CFVal && CFVal->isZero() && LHS.getValueType() == RHS.getValueType()) { EVT VT = LHS.getValueType(); SDValue Shift = DAG.getNode(ISD::SRA, dl, VT, LHS, DAG.getConstant(VT.getSizeInBits() - 1, dl, VT)); if (CC == ISD::SETGT) Shift = DAG.getNOT(dl, Shift, VT); return DAG.getNode(ISD::AND, dl, VT, LHS, Shift); } unsigned Opcode = AArch64ISD::CSEL; // If both the TVal and the FVal are constants, see if we can swap them in // order to for a CSINV or CSINC out of them. if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } else if (TVal.getOpcode() == ISD::XOR) { // If TVal is a NOT we want to swap TVal and FVal so that we can match // with a CSINV rather than a CSEL. if (isAllOnesConstant(TVal.getOperand(1))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } } else if (TVal.getOpcode() == ISD::SUB) { // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so // that we can match with a CSNEG rather than a CSEL. if (isNullConstant(TVal.getOperand(0))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } } else if (CTVal && CFVal) { const int64_t TrueVal = CTVal->getSExtValue(); const int64_t FalseVal = CFVal->getSExtValue(); bool Swap = false; // If both TVal and FVal are constants, see if FVal is the // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC // instead of a CSEL in that case. if (TrueVal == ~FalseVal) { Opcode = AArch64ISD::CSINV; } else if (FalseVal > std::numeric_limits::min() && TrueVal == -FalseVal) { Opcode = AArch64ISD::CSNEG; } else if (TVal.getValueType() == MVT::i32) { // If our operands are only 32-bit wide, make sure we use 32-bit // arithmetic for the check whether we can use CSINC. This ensures that // the addition in the check will wrap around properly in case there is // an overflow (which would not be the case if we do the check with // 64-bit arithmetic). const uint32_t TrueVal32 = CTVal->getZExtValue(); const uint32_t FalseVal32 = CFVal->getZExtValue(); if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { Opcode = AArch64ISD::CSINC; if (TrueVal32 > FalseVal32) { Swap = true; } } } else { // 64-bit check whether we can use CSINC. const uint64_t TrueVal64 = TrueVal; const uint64_t FalseVal64 = FalseVal; if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) { Opcode = AArch64ISD::CSINC; if (TrueVal > FalseVal) { Swap = true; } } } // Swap TVal and FVal if necessary. 
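// (Illustrative examples of the constant pairs recognised above; only one
// constant is materialized and the conditions are placeholders:
//   cond ? 5 : ~5  ->  mov w8, #5 ; csinv w0, w8, w8, <cond>
//   cond ? 5 : -5  ->  mov w8, #5 ; csneg w0, w8, w8, <cond>
//   cond ? 6 : 5   ->  mov w8, #5 ; csinc w0, w8, w8, <inverted cond> )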
if (Swap) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } if (Opcode != AArch64ISD::CSEL) { // Drop FVal since we can get its value by simply inverting/negating // TVal. FVal = TVal; } } // Avoid materializing a constant when possible by reusing a known value in // a register. However, don't perform this optimization if the known value // is one, zero or negative one in the case of a CSEL. We can always // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the // FVal, respectively. ConstantSDNode *RHSVal = dyn_cast(RHS); if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() && !RHSVal->isZero() && !RHSVal->isAllOnes()) { AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to // "a != C ? x : a" to avoid materializing C. if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ) TVal = LHS; else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE) FVal = LHS; } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) { assert (CTVal && CFVal && "Expected constant operands for CSNEG."); // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to // avoid materializing C. AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) { Opcode = AArch64ISD::CSINV; TVal = LHS; FVal = DAG.getConstant(0, dl, FVal.getValueType()); } } SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); EVT VT = TVal.getValueType(); return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); } // Now we know we're dealing with FP values. assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); assert(LHS.getValueType() == RHS.getValueType()); EVT VT = TVal.getValueType(); SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two CSELs to implement. AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); if (DAG.getTarget().Options.UnsafeFPMath) { // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0. ConstantFPSDNode *RHSVal = dyn_cast(RHS); if (RHSVal && RHSVal->isZero()) { ConstantFPSDNode *CFVal = dyn_cast(FVal); ConstantFPSDNode *CTVal = dyn_cast(TVal); if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) && CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType()) TVal = LHS; else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) && CFVal && CFVal->isZero() && FVal.getValueType() == LHS.getValueType()) FVal = LHS; } } // Emit first, and possibly only, CSEL. SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); // If we need a second CSEL, emit it, using the output of the first as the // RHS. We're effectively OR'ing the two CC's together. if (CC2 != AArch64CC::AL) { SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } // Otherwise, return the output of the first CSEL. 
return CS1; } SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const { EVT Ty = Op.getValueType(); auto Idx = Op.getConstantOperandAPInt(2); int64_t IdxVal = Idx.getSExtValue(); assert(Ty.isScalableVector() && "Only expect scalable vectors for custom lowering of VECTOR_SPLICE"); // We can use the splice instruction for certain index values where we are // able to efficiently generate the correct predicate. The index will be // inverted and used directly as the input to the ptrue instruction, i.e. // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the // splice predicate. However, we can only do this if we can guarantee that // there are enough elements in the vector, hence we check the index <= min // number of elements. std::optional PredPattern; if (Ty.isScalableVector() && IdxVal < 0 && (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) != std::nullopt) { SDLoc DL(Op); // Create a predicate where all but the last -IdxVal elements are false. EVT PredVT = Ty.changeVectorElementType(MVT::i1); SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern); Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred); // Now splice the two inputs together using the predicate. return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0), Op.getOperand(1)); } // This will select to an EXT instruction, which has a maximum immediate // value of 255, hence 2048-bits is the maximum value we can lower. if (IdxVal >= 0 && IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits())) return Op; return SDValue(); } SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(4))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue TVal = Op.getOperand(2); SDValue FVal = Op.getOperand(3); SDLoc DL(Op); return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); } SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue CCVal = Op->getOperand(0); SDValue TVal = Op->getOperand(1); SDValue FVal = Op->getOperand(2); SDLoc DL(Op); EVT Ty = Op.getValueType(); if (Ty == MVT::aarch64svcount) { TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal); FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal); SDValue Sel = DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal); return DAG.getNode(ISD::BITCAST, DL, Ty, Sel); } if (Ty.isScalableVector()) { MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount()); SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal); return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); } if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) { // FIXME: Ideally this would be the same as above using i1 types, however // for the moment we can't deal with fixed i1 vector types properly, so // instead extend the predicate to a result type sized integer vector. MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits()); MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount()); SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT); SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal); return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); } // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select // instruction. if (ISD::isOverflowIntrOpRes(CCVal)) { // Only lower legal XALUO ops. 
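// Illustrative source pattern handled here (C-like; names are hypothetical):
//   bool ov = __builtin_sadd_overflow(a, b, &s);
//   x = ov ? t : f;
// which can be emitted as "adds ...; csel ..., vs", reusing the flags
// instead of materializing the boolean overflow bit.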
if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) return SDValue(); AArch64CC::CondCode OFCC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, CCVal, Overflow); } // Lower it the same way as we would lower a SELECT_CC node. ISD::CondCode CC; SDValue LHS, RHS; if (CCVal.getOpcode() == ISD::SETCC) { LHS = CCVal.getOperand(0); RHS = CCVal.getOperand(1); CC = cast(CCVal.getOperand(2))->get(); } else { LHS = CCVal; RHS = DAG.getConstant(0, DL, CCVal.getValueType()); CC = ISD::SETNE; } // If we are lowering a f16 and we do not have fullf16, convert to a f32 in // order to use FCSELSrrr if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) { TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32, DAG.getUNDEF(MVT::f32), TVal); FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32, DAG.getUNDEF(MVT::f32), FVal); } SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) { return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res); } return Res; } SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // Jump table entries as PC relative offsets. No additional tweaking // is necessary here. Just get the address of the jump table. JumpTableSDNode *JT = cast(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { return getAddrLarge(JT, DAG); } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { return getAddrTiny(JT, DAG); } return getAddr(JT, DAG); } SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { // Jump table entries as PC relative offsets. No additional tweaking // is necessary here. Just get the address of the jump table. SDLoc DL(Op); SDValue JT = Op.getOperand(1); SDValue Entry = Op.getOperand(2); int JTI = cast(JT.getNode())->getIndex(); auto *AFI = DAG.getMachineFunction().getInfo(); AFI->setJumpTableEntryInfo(JTI, 4, nullptr); SDNode *Dest = DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT, Entry, DAG.getTargetJumpTable(JTI, MVT::i32)); return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0), SDValue(Dest, 0)); } SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large) { // Use the GOT for the large code model on iOS. 
if (Subtarget->isTargetMachO()) { return getGOT(CP, DAG); } return getAddrLarge(CP, DAG); } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { return getAddrTiny(CP, DAG); } else { return getAddr(CP, DAG); } } SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { BlockAddressSDNode *BA = cast(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { return getAddrLarge(BA, DAG); } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { return getAddrTiny(BA, DAG); } return getAddr(BA, DAG); } SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const { AArch64FunctionInfo *FuncInfo = DAG.getMachineFunction().getInfo(); SDLoc DL(Op); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy(DAG.getDataLayout())); FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout())); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); } SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); SDLoc DL(Op); SDValue FR; if (Subtarget->isWindowsArm64EC()) { // With the Arm64EC ABI, we compute the address of the varargs save area // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry, // but calls from an entry thunk can pass in a different address. Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass); SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64); uint64_t StackOffset; if (FuncInfo->getVarArgsGPRSize() > 0) StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize(); else StackOffset = FuncInfo->getVarArgsStackOffset(); FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val, DAG.getConstant(StackOffset, DL, MVT::i64)); } else { FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0 ? FuncInfo->getVarArgsGPRIndex() : FuncInfo->getVarArgsStackIndex(), getPointerTy(DAG.getDataLayout())); } const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); } SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const { // The layout of the va_list struct is specified in the AArch64 Procedure Call // Standard, section B.3. MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); unsigned PtrSize = Subtarget->isTargetILP32() ? 
4 : 8; auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue VAList = Op.getOperand(1); const Value *SV = cast(Op.getOperand(2))->getValue(); SmallVector MemOps; // void *__stack at offset 0 unsigned Offset = 0; SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT); MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, MachinePointerInfo(SV), Align(PtrSize))); // void *__gr_top at offset 8 (4 on ILP32) Offset += PtrSize; int GPRSize = FuncInfo->getVarArgsGPRSize(); if (GPRSize > 0) { SDValue GRTop, GRTopAddr; GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Offset, DL, PtrVT)); GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, DAG.getConstant(GPRSize, DL, PtrVT)); GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, MachinePointerInfo(SV, Offset), Align(PtrSize))); } // void *__vr_top at offset 16 (8 on ILP32) Offset += PtrSize; int FPRSize = FuncInfo->getVarArgsFPRSize(); if (FPRSize > 0) { SDValue VRTop, VRTopAddr; VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Offset, DL, PtrVT)); VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, DAG.getConstant(FPRSize, DL, PtrVT)); VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, MachinePointerInfo(SV, Offset), Align(PtrSize))); } // int __gr_offs at offset 24 (12 on ILP32) Offset += PtrSize; SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Offset, DL, PtrVT)); MemOps.push_back( DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr, MachinePointerInfo(SV, Offset), Align(4))); // int __vr_offs at offset 28 (16 on ILP32) Offset += 4; SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Offset, DL, PtrVT)); MemOps.push_back( DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr, MachinePointerInfo(SV, Offset), Align(4))); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv())) return LowerWin64_VASTART(Op, DAG); else if (Subtarget->isTargetDarwin()) return LowerDarwin_VASTART(Op, DAG); else return LowerAAPCS_VASTART(Op, DAG); } SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. SDLoc DL(Op); unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; unsigned VaListSize = (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) ? PtrSize : Subtarget->isTargetILP32() ? 
20 : 32; const Value *DestSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), DAG.getConstant(VaListSize, DL, MVT::i32), Align(PtrSize), false, false, false, MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); } SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin() && "automatic va_arg instruction only works on Darwin"); const Value *V = cast(Op.getOperand(2))->getValue(); EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); MaybeAlign Align(Op.getConstantOperandVal(3)); unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8; auto PtrVT = getPointerTy(DAG.getDataLayout()); auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); SDValue VAList = DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V)); Chain = VAList.getValue(1); VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT); if (VT.isScalableVector()) report_fatal_error("Passing SVE types to variadic functions is " "currently not supported"); if (Align && *Align > MinSlotSize) { VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Align->value() - 1, DL, PtrVT)); VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT)); } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); // Scalar integer and FP values smaller than 64 bits are implicitly extended // up to 64 bits. At the very least, we have to increase the striding of the // vaargs list to match this, and for FP values we need to introduce // FP_ROUND nodes as well. if (VT.isInteger() && !VT.isVector()) ArgSize = std::max(ArgSize, MinSlotSize); bool NeedFPTrunc = false; if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { ArgSize = 8; NeedFPTrunc = true; } // Increment the pointer, VAList, to the next vaarg SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(ArgSize, DL, PtrVT)); VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT); // Store the incremented VAList to the legalized pointer SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); // Load the actual argument out of the pointer VAList if (NeedFPTrunc) { // Load the value as an f64. SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo()); // Round the value down to an f32. SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), DAG.getIntPtrConstant(1, DL, /*isTarget=*/true)); SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; // Merge the rounded value with the chain output of the load. 
return DAG.getMergeValues(Ops, DL); } return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo()); } SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc DL(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64); while (Depth--) FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, MachinePointerInfo()); if (Subtarget->isTargetILP32()) FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr, DAG.getValueType(VT)); return FrameAddr; } SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); EVT VT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); int FI = MFI.CreateFixedObject(4, 0, false); return DAG.getFrameIndex(FI, VT); } #define GET_REGISTER_MATCHER #include "AArch64GenAsmMatcher.inc" // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. Register AArch64TargetLowering:: getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { Register Reg = MatchRegisterName(RegName); if (AArch64::X1 <= Reg && Reg <= AArch64::X28) { const MCRegisterInfo *MRI = Subtarget->getRegisterInfo(); unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false); if (!Subtarget->isXRegisterReserved(DwarfRegNum)) Reg = 0; } if (Reg) return Reg; report_fatal_error(Twine("Invalid register name \"" + StringRef(RegName) + "\".")); } SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const { DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset); } SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setReturnAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc DL(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); SDValue ReturnAddress; if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); ReturnAddress = DAG.getLoad( VT, DL, DAG.getEntryNode(), DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo()); } else { // Return LR, which contains the return address. Mark it an implicit // live-in. Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); } // The XPACLRI instruction assembles to a hint-space instruction before // Armv8.3-A therefore this instruction can be safely used for any pre // Armv8.3-A architectures. On Armv8.3-A and onwards XPACI is available so use // that instead. SDNode *St; if (Subtarget->hasPAuth()) { St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress); } else { // XPACLRI operates on LR therefore we must move the operand accordingly. 
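// Illustrative detail: XPACLRI sits in the hint space (HINT #7), strips the
// pointer-authentication code from the address held in LR in place, and
// executes as a NOP on cores without PAuth; hence the return address is
// first copied into LR below.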
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress); St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain); } return SDValue(St, 0); } /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which returns two /// i32 values and take a 2 x i32 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const { SDValue Lo, Hi; expandShiftParts(Op.getNode(), Lo, Hi, DAG); return DAG.getMergeValues({Lo, Hi}, SDLoc(Op)); } bool AArch64TargetLowering::isOffsetFoldingLegal( const GlobalAddressSDNode *GA) const { // Offsets are folded in the DAG combine rather than here so that we can // intelligently choose an offset based on the uses. return false; } bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool OptForSize) const { bool IsLegal = false; // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and // 16-bit case when target has full fp16 support. // FIXME: We should be able to handle f128 as well with a clever lowering. const APInt ImmInt = Imm.bitcastToAPInt(); if (VT == MVT::f64) IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero(); else if (VT == MVT::f32) IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero(); else if (VT == MVT::f16 || VT == MVT::bf16) IsLegal = (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) || Imm.isPosZero(); // If we can not materialize in immediate field for fmov, check if the // value can be encoded as the immediate operand of a logical instruction. // The immediate value will be created with either MOVZ, MOVN, or ORR. // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to // generate that fmov. if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) { // The cost is actually exactly the same for mov+fmov vs. adrp+ldr; // however the mov+fmov sequence is always better because of the reduced // cache pressure. The timings are still the same if you consider // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the // movw+movk is fused). So we limit up to 2 instrdduction at most. SmallVector Insn; AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn); unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2)); IsLegal = Insn.size() <= Limit; } LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT << " imm value: "; Imm.dump();); return IsLegal; } //===----------------------------------------------------------------------===// // AArch64 Optimization Hooks //===----------------------------------------------------------------------===// static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps) { EVT VT = Operand.getValueType(); if ((ST->hasNEON() && (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 || VT == MVT::v4f32)) || (ST->hasSVE() && (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) { if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) // For the reciprocal estimates, convergence is quadratic, so the number // of digits is doubled after each iteration. In ARMv8, the accuracy of // the initial estimate is 2^-8. Thus the number of extra steps to refine // the result for float (23 mantissa bits) is 2 and for double (52 // mantissa bits) is 3. ExtraSteps = VT.getScalarType() == MVT::f64 ? 
3 : 2; return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); } return SDValue(); } SDValue AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG, const DenormalMode &Mode) const { SDLoc DL(Op); EVT VT = Op.getValueType(); EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); } SDValue AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op, SelectionDAG &DAG) const { return Op; } SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const { if (Enabled == ReciprocalEstimate::Enabled || (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt())) if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, DAG, ExtraSteps)) { SDLoc DL(Operand); EVT VT = Operand.getValueType(); SDNodeFlags Flags; Flags.setAllowReassociation(true); // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) for (int i = ExtraSteps; i > 0; --i) { SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, Flags); Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags); Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); } if (!Reciprocal) Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags); ExtraSteps = 0; return Estimate; } return SDValue(); } SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps) const { if (Enabled == ReciprocalEstimate::Enabled) if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, DAG, ExtraSteps)) { SDLoc DL(Operand); EVT VT = Operand.getValueType(); SDNodeFlags Flags; Flags.setAllowReassociation(true); // Newton reciprocal iteration: E * (2 - X * E) // AArch64 reciprocal iteration instruction: (2 - M * N) for (int i = ExtraSteps; i > 0; --i) { SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, Estimate, Flags); Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); } ExtraSteps = 0; return Estimate; } return SDValue(); } //===----------------------------------------------------------------------===// // AArch64 Inline Assembly Support //===----------------------------------------------------------------------===// // Table of Constraints // TODO: This is the current set of constraints supported by ARM for the // compiler, not all of them may make sense. // // r - A general register // w - An FP/SIMD register of some size in the range v0-v31 // x - An FP/SIMD register of some size in the range v0-v15 // I - Constant that can be used with an ADD instruction // J - Constant that can be used with a SUB instruction // K - Constant that can be used with a 32-bit logical instruction // L - Constant that can be used with a 64-bit logical instruction // M - Constant that can be used as a 32-bit MOV immediate // N - Constant that can be used as a 64-bit MOV immediate // Q - A memory reference with base register and no offset // S - A symbolic address // Y - Floating point constant zero // Z - Integer constant zero // // Note that general register operands will be output using their 64-bit x // register name, whatever the size of the variable, unless the asm operand // is prefixed by the %w modifier. 
Floating-point and SIMD register operands // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or // %q modifier. const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { // At this point, we have to lower this constraint to something else, so we // lower it to an "r" or "w". However, by doing this we will force the result // to be in register, while the X constraint is much more permissive. // // Although we are correct (we are free to emit anything, without // constraints), we might break use cases that would expect us to be more // efficient and emit something else. if (!Subtarget->hasFPARMv8()) return "r"; if (ConstraintVT.isFloatingPoint()) return "w"; if (ConstraintVT.isVector() && (ConstraintVT.getSizeInBits() == 64 || ConstraintVT.getSizeInBits() == 128)) return "w"; return "r"; } enum PredicateConstraint { Upl, Upa, Invalid }; static PredicateConstraint parsePredicateConstraint(StringRef Constraint) { PredicateConstraint P = PredicateConstraint::Invalid; if (Constraint == "Upa") P = PredicateConstraint::Upa; if (Constraint == "Upl") P = PredicateConstraint::Upl; return P; } // The set of cc code supported is from // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) { AArch64CC::CondCode Cond = StringSwitch(Constraint) .Case("{@cchi}", AArch64CC::HI) .Case("{@cccs}", AArch64CC::HS) .Case("{@cclo}", AArch64CC::LO) .Case("{@ccls}", AArch64CC::LS) .Case("{@cccc}", AArch64CC::LO) .Case("{@cceq}", AArch64CC::EQ) .Case("{@ccgt}", AArch64CC::GT) .Case("{@ccge}", AArch64CC::GE) .Case("{@cclt}", AArch64CC::LT) .Case("{@ccle}", AArch64CC::LE) .Case("{@cchs}", AArch64CC::HS) .Case("{@ccne}", AArch64CC::NE) .Case("{@ccvc}", AArch64CC::VC) .Case("{@ccpl}", AArch64CC::PL) .Case("{@ccvs}", AArch64CC::VS) .Case("{@ccmi}", AArch64CC::MI) .Default(AArch64CC::Invalid); return Cond; } /// Helper function to create 'CSET', which is equivalent to 'CSINC , WZR, /// WZR, invert()'. static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG) { return DAG.getNode( AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV); } // Lower @cc flag output via getSETCC. SDValue AArch64TargetLowering::LowerAsmOutputForConstraint( SDValue &Chain, SDValue &Glue, const SDLoc &DL, const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const { AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode); if (Cond == AArch64CC::Invalid) return SDValue(); // The output variable should be a scalar integer. if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() || OpInfo.ConstraintVT.getSizeInBits() < 8) report_fatal_error("Flag output operand is of invalid type"); // Get NZCV register. Only update chain when copyfrom is glued. if (Glue.getNode()) { Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue); Chain = Glue.getValue(1); } else Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32); // Extract CC code. SDValue CC = getSETCC(Cond, Glue, DL, DAG); SDValue Result; // Truncate or ZERO_EXTEND based on value types. 
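// Usage sketch (illustrative; follows the GCC flag-output syntax referenced
// above, variable names are hypothetical):
//   int eq;
//   asm("cmp %w1, %w2" : "=@cceq"(eq) : "r"(a), "r"(b));
// 'eq' then receives the EQ flag via the CSINC-based CSET built by getSETCC.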
if (OpInfo.ConstraintVT.getSizeInBits() <= 32) Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC); else Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC); return Result; } /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. AArch64TargetLowering::ConstraintType AArch64TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; case 'x': case 'w': case 'y': return C_RegisterClass; // An address with a single base register. Due to the way we // currently handle addresses it is the same as 'r'. case 'Q': return C_Memory; case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'Y': case 'Z': return C_Immediate; case 'z': case 'S': // A symbolic address return C_Other; } } else if (parsePredicateConstraint(Constraint) != PredicateConstraint::Invalid) return C_RegisterClass; else if (parseConstraintCode(Constraint) != AArch64CC::Invalid) return C_Other; return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight AArch64TargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); break; case 'x': case 'w': case 'y': if (type->isFloatingPointTy() || type->isVectorTy()) weight = CW_Register; break; case 'z': weight = CW_Constant; break; case 'U': if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid) weight = CW_Register; break; } return weight; } std::pair AArch64TargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': if (VT.isScalableVector()) return std::make_pair(0U, nullptr); if (Subtarget->hasLS64() && VT.getSizeInBits() == 512) return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass); if (VT.getFixedSizeInBits() == 64) return std::make_pair(0U, &AArch64::GPR64commonRegClass); return std::make_pair(0U, &AArch64::GPR32commonRegClass); case 'w': { if (!Subtarget->hasFPARMv8()) break; if (VT.isScalableVector()) { if (VT.getVectorElementType() != MVT::i1) return std::make_pair(0U, &AArch64::ZPRRegClass); return std::make_pair(0U, nullptr); } uint64_t VTSize = VT.getFixedSizeInBits(); if (VTSize == 16) return std::make_pair(0U, &AArch64::FPR16RegClass); if (VTSize == 32) return std::make_pair(0U, &AArch64::FPR32RegClass); if (VTSize == 64) return std::make_pair(0U, &AArch64::FPR64RegClass); if (VTSize == 128) return std::make_pair(0U, &AArch64::FPR128RegClass); break; } // The instructions that this constraint is designed for can // only take 128-bit registers so just use that regclass. 
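// Usage sketch (illustrative; variables are hypothetical): "w" accepts any
// of v0-v31, while "x" restricts the operand to v0-v15 for instructions
// whose encoding has only four bits for that register, e.g. the by-element
// forms with 16-bit lanes:
//   asm("mla %0.4h, %1.4h, %2.h[0]" : "+w"(acc) : "w"(a), "x"(b));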
case 'x': if (!Subtarget->hasFPARMv8()) break; if (VT.isScalableVector()) return std::make_pair(0U, &AArch64::ZPR_4bRegClass); if (VT.getSizeInBits() == 128) return std::make_pair(0U, &AArch64::FPR128_loRegClass); break; case 'y': if (!Subtarget->hasFPARMv8()) break; if (VT.isScalableVector()) return std::make_pair(0U, &AArch64::ZPR_3bRegClass); break; } } else { PredicateConstraint PC = parsePredicateConstraint(Constraint); if (PC != PredicateConstraint::Invalid) { if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1) return std::make_pair(0U, nullptr); bool restricted = (PC == PredicateConstraint::Upl); return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass) : std::make_pair(0U, &AArch64::PPRRegClass); } } if (StringRef("{cc}").equals_insensitive(Constraint) || parseConstraintCode(Constraint) != AArch64CC::Invalid) return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair Res; Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? if (!Res.second) { unsigned Size = Constraint.size(); if ((Size == 4 || Size == 5) && Constraint[0] == '{' && tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { int RegNo; bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); if (!Failed && RegNo >= 0 && RegNo <= 31) { // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. // By default we'll emit v0-v31 for this unless there's a modifier where // we'll emit the correct register as well. if (VT != MVT::Other && VT.getSizeInBits() == 64) { Res.first = AArch64::FPR64RegClass.getRegister(RegNo); Res.second = &AArch64::FPR64RegClass; } else { Res.first = AArch64::FPR128RegClass.getRegister(RegNo); Res.second = &AArch64::FPR128RegClass; } } } } if (Res.second && !Subtarget->hasFPARMv8() && !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) && !AArch64::GPR64allRegClass.hasSubClassEq(Res.second)) return std::make_pair(0U, nullptr); return Res; } EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL, llvm::Type *Ty, bool AllowUnknown) const { if (Subtarget->hasLS64() && Ty->isIntegerTy(512)) return EVT(MVT::i64x8); return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown); } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. void AArch64TargetLowering::LowerAsmOperandForConstraint( SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const { SDValue Result; // Currently only support length 1 constraints. if (Constraint.length() != 1) return; char ConstraintLetter = Constraint[0]; switch (ConstraintLetter) { default: break; // This set of constraints deal with valid constants for various instructions. // Validate and return a target constant for them if we can. case 'z': { // 'z' maps to xzr or wzr so it needs an input of 0. if (!isNullConstant(Op)) return; if (Op.getValueType() == MVT::i64) Result = DAG.getRegister(AArch64::XZR, MVT::i64); else Result = DAG.getRegister(AArch64::WZR, MVT::i32); break; } case 'S': { // An absolute symbolic address or label reference. 
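// Usage sketch (illustrative; 'g' and 'p' are hypothetical): "S" lets the
// operand be printed as a bare symbol, e.g.
//   asm("adrp %0, %1\n\tadd %0, %0, :lo12:%1" : "=r"(p) : "S"(&g));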
    if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
      Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
                                          GA->getValueType(0));
    } else if (const BlockAddressSDNode *BA =
                   dyn_cast<BlockAddressSDNode>(Op)) {
      Result = DAG.getTargetBlockAddress(BA->getBlockAddress(),
                                         BA->getValueType(0));
    } else
      return;
    break;
  }
  case 'I':
  case 'J':
  case 'K':
  case 'L':
  case 'M':
  case 'N':
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
    if (!C)
      return;

    // Grab the value and do some validation.
    uint64_t CVal = C->getZExtValue();
    switch (ConstraintLetter) {
    // The I constraint applies only to simple ADD or SUB immediate operands:
    // i.e. 0 to 4095 with optional shift by 12
    // The J constraint applies only to ADD or SUB immediates that would be
    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
    // instruction [or vice versa], in other words -1 to -4095 with optional
    // left shift by 12.
    case 'I':
      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
        break;
      return;
    case 'J': {
      uint64_t NVal = -C->getSExtValue();
      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
        CVal = C->getSExtValue();
        break;
      }
      return;
    }
    // The K and L constraints apply *only* to logical immediates, including
    // what used to be the MOVI alias for ORR (though the MOVI alias has now
    // been removed and MOV should be used). So these constraints have to
    // distinguish between bit patterns that are valid 32-bit or 64-bit
    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
    // versa.
    case 'K':
      if (AArch64_AM::isLogicalImmediate(CVal, 32))
        break;
      return;
    case 'L':
      if (AArch64_AM::isLogicalImmediate(CVal, 64))
        break;
      return;
    // The M and N constraints are a superset of K and L respectively, for use
    // with the MOV (immediate) alias. As well as the logical immediates they
    // also match 32 or 64-bit immediates that can be loaded either using a
    // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
    // (M) or 64-bit 0x1234000000000000 (N) etc.
    // As a note some of this code is liberally stolen from the asm parser.
    case 'M': {
      if (!isUInt<32>(CVal))
        return;
      if (AArch64_AM::isLogicalImmediate(CVal, 32))
        break;
      if ((CVal & 0xFFFF) == CVal)
        break;
      if ((CVal & 0xFFFF0000ULL) == CVal)
        break;
      uint64_t NCVal = ~(uint32_t)CVal;
      if ((NCVal & 0xFFFFULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF0000ULL) == NCVal)
        break;
      return;
    }
    case 'N': {
      if (AArch64_AM::isLogicalImmediate(CVal, 64))
        break;
      if ((CVal & 0xFFFFULL) == CVal)
        break;
      if ((CVal & 0xFFFF0000ULL) == CVal)
        break;
      if ((CVal & 0xFFFF00000000ULL) == CVal)
        break;
      if ((CVal & 0xFFFF000000000000ULL) == CVal)
        break;
      uint64_t NCVal = ~CVal;
      if ((NCVal & 0xFFFFULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF0000ULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
        break;
      return;
    }
    default:
      return;
    }

    // All assembler immediates are 64-bit integers.
    Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
    break;
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }

  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

//===----------------------------------------------------------------------===//
// AArch64 Advanced SIMD Support
//===----------------------------------------------------------------------===//

/// WidenVector - Given a value in the V64 register class, produce the
/// equivalent value in the V128 register class.
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { EVT VT = V64Reg.getValueType(); unsigned NarrowSize = VT.getVectorNumElements(); MVT EltTy = VT.getVectorElementType().getSimpleVT(); MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); SDLoc DL(V64Reg); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), V64Reg, DAG.getConstant(0, DL, MVT::i64)); } /// getExtFactor - Determine the adjustment factor for the position when /// generating an "extract from vector registers" instruction. static unsigned getExtFactor(SDValue &V) { EVT EltType = V.getValueType().getVectorElementType(); return EltType.getSizeInBits() / 8; } // Check if a vector is built from one vector via extracted elements of // another together with an AND mask, ensuring that all elements fit // within range. This can be reconstructed using AND and NEON's TBL1. SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); SDLoc dl(Op); EVT VT = Op.getValueType(); assert(!VT.isScalableVector() && "Scalable vectors cannot be used with ISD::BUILD_VECTOR"); // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map // directly to TBL1. if (VT != MVT::v16i8 && VT != MVT::v8i8) return SDValue(); unsigned NumElts = VT.getVectorNumElements(); assert((NumElts == 8 || NumElts == 16) && "Need to have exactly 8 or 16 elements in vector."); SDValue SourceVec; SDValue MaskSourceVec; SmallVector AndMaskConstants; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); SDValue OperandSourceVec = V.getOperand(0); if (!SourceVec) SourceVec = OperandSourceVec; else if (SourceVec != OperandSourceVec) return SDValue(); // This only looks at shuffles with elements that are // a) truncated by a constant AND mask extracted from a mask vector, or // b) extracted directly from a mask vector. SDValue MaskSource = V.getOperand(1); if (MaskSource.getOpcode() == ISD::AND) { if (!isa(MaskSource.getOperand(1))) return SDValue(); AndMaskConstants.push_back(MaskSource.getOperand(1)); MaskSource = MaskSource->getOperand(0); } else if (!AndMaskConstants.empty()) { // Either all or no operands should have an AND mask. return SDValue(); } // An ANY_EXTEND may be inserted between the AND and the source vector // extraction. We don't care about that, so we can just skip it. if (MaskSource.getOpcode() == ISD::ANY_EXTEND) MaskSource = MaskSource.getOperand(0); if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); SDValue MaskIdx = MaskSource.getOperand(1); if (!isa(MaskIdx) || !cast(MaskIdx)->getConstantIntValue()->equalsInt(i)) return SDValue(); // We only apply this if all elements come from the same vector with the // same vector type. if (!MaskSourceVec) { MaskSourceVec = MaskSource->getOperand(0); if (MaskSourceVec.getValueType() != VT) return SDValue(); } else if (MaskSourceVec != MaskSource->getOperand(0)) { return SDValue(); } } // We need a v16i8 for TBL, so we extend the source with a placeholder vector // for v8i8 to get a v16i8. As the pattern we are replacing is extract + // insert, we know that the index in the mask must be smaller than the number // of elements in the source, or we would have an out-of-bounds access. if (NumElts == 8) SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec, DAG.getUNDEF(VT)); // Preconditions met, so we can use a vector (AND +) TBL to build this vector. 
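  // A sketch of the recognized pattern (register names illustrative): every
  // lane of the BUILD_VECTOR is extract_elt(Src, and(extract_elt(MaskVec, i),
  // C)), so the whole vector can be emitted as, roughly,
  //   movi v2.16b, #C
  //   and  v1.16b, vMask.16b, v2.16b
  //   tbl  v0.16b, { vSrc.16b }, v1.16b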
if (!AndMaskConstants.empty()) MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec, DAG.getBuildVector(VT, dl, AndMaskConstants)); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec, MaskSourceVec); } // Gather data to see if the operation can be modelled as a // shuffle in combination with VEXTs. SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n"); SDLoc dl(Op); EVT VT = Op.getValueType(); assert(!VT.isScalableVector() && "Scalable vectors cannot be used with ISD::BUILD_VECTOR"); unsigned NumElts = VT.getVectorNumElements(); struct ShuffleSourceInfo { SDValue Vec; unsigned MinElt; unsigned MaxElt; // We may insert some combination of BITCASTs and VEXT nodes to force Vec to // be compatible with the shuffle we intend to construct. As a result // ShuffleVec will be some sliding window into the original Vec. SDValue ShuffleVec; // Code should guarantee that element i in Vec starts at element "WindowBase // + i * WindowScale in ShuffleVec". int WindowBase; int WindowScale; ShuffleSourceInfo(SDValue Vec) : Vec(Vec), MinElt(std::numeric_limits::max()), MaxElt(0), ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } }; // First gather all vectors used as an immediate source for this BUILD_VECTOR // node. SmallVector Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.isUndef()) continue; else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(V.getOperand(1)) || V.getOperand(0).getValueType().isScalableVector()) { LLVM_DEBUG( dbgs() << "Reshuffle failed: " "a shuffle can only come from building a vector from " "various elements of other fixed-width vectors, provided " "their indices are constant\n"); return SDValue(); } // Add this element source to the list if it's not already there. SDValue SourceVec = V.getOperand(0); auto Source = find(Sources, SourceVec); if (Source == Sources.end()) Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); // Update the minimum and maximum lane number seen. unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); Source->MinElt = std::min(Source->MinElt, EltNo); Source->MaxElt = std::max(Source->MaxElt, EltNo); } // If we have 3 or 4 sources, try to generate a TBL, which will at least be // better than moving to/from gpr registers for larger vectors. if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) { // Construct a mask for the tbl. We may need to adjust the index for types // larger than i8. SmallVector Mask; unsigned OutputFactor = VT.getScalarSizeInBits() / 8; for (unsigned I = 0; I < NumElts; ++I) { SDValue V = Op.getOperand(I); if (V.isUndef()) { for (unsigned OF = 0; OF < OutputFactor; OF++) Mask.push_back(-1); continue; } // Set the Mask lanes adjusted for the size of the input and output // lanes. The Mask is always i8, so it will set OutputFactor lanes per // output element, adjusted in their positions per input and output types. 
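      // Worked example: for a v8i16 result built from three v8i16 sources,
      // OutputFactor == 2; an element taken from lane 3 of source S == 1 gives
      // InputBase = 16 * 1 + 3 * 16 / 8 = 22, so bytes 22 and 23 are pushed
      // into the TBL mask for that output element.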
unsigned Lane = V.getConstantOperandVal(1); for (unsigned S = 0; S < Sources.size(); S++) { if (V.getOperand(0) == Sources[S].Vec) { unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits(); unsigned InputBase = 16 * S + Lane * InputSize / 8; for (unsigned OF = 0; OF < OutputFactor; OF++) Mask.push_back(InputBase + OF); break; } } } // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to // v16i8, and the TBLMask SmallVector TBLOperands; TBLOperands.push_back(DAG.getConstant(Sources.size() == 3 ? Intrinsic::aarch64_neon_tbl3 : Intrinsic::aarch64_neon_tbl4, dl, MVT::i32)); for (unsigned i = 0; i < Sources.size(); i++) { SDValue Src = Sources[i].Vec; EVT SrcVT = Src.getValueType(); Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src); assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) && "Expected a legally typed vector"); if (SrcVT.is64BitVector()) Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src, DAG.getUNDEF(MVT::v8i8)); TBLOperands.push_back(Src); } SmallVector TBLMask; for (unsigned i = 0; i < Mask.size(); i++) TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32)); assert((Mask.size() == 8 || Mask.size() == 16) && "Expected a v8i8 or v16i8 Mask"); TBLOperands.push_back( DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask)); SDValue Shuffle = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands); return DAG.getBitcast(VT, Shuffle); } if (Sources.size() > 2) { LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something " << "sensible when at most two source vectors are " << "involved\n"); return SDValue(); } // Find out the smallest element size among result and two sources, and use // it as element size to build the shuffle_vector. EVT SmallestEltTy = VT.getVectorElementType(); for (auto &Source : Sources) { EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); if (SrcEltTy.bitsLT(SmallestEltTy)) { SmallestEltTy = SrcEltTy; } } unsigned ResMultiplier = VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits(); uint64_t VTSize = VT.getFixedSizeInBits(); NumElts = VTSize / SmallestEltTy.getFixedSizeInBits(); EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); // If the source vector is too wide or too narrow, we may nevertheless be able // to construct a compatible shuffle either by concatenating it with UNDEF or // extracting a suitable range of elements. for (auto &Src : Sources) { EVT SrcVT = Src.ShuffleVec.getValueType(); TypeSize SrcVTSize = SrcVT.getSizeInBits(); if (SrcVTSize == TypeSize::Fixed(VTSize)) continue; // This stage of the search produces a source with the same element type as // the original, but with a total width matching the BUILD_VECTOR output. EVT EltVT = SrcVT.getVectorElementType(); unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); if (SrcVTSize.getFixedValue() < VTSize) { assert(2 * SrcVTSize == VTSize); // We can pad out the smaller vector for free, so if it's part of a // shuffle... 
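    // E.g. a v8i8 source feeding a 128-bit wide shuffle is widened here to
    // v16i8 by concatenating it with an undef v8i8; its lane numbering is
    // unchanged, so WindowBase stays 0.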
Src.ShuffleVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, DAG.getUNDEF(Src.ShuffleVec.getValueType())); continue; } if (SrcVTSize.getFixedValue() != 2 * VTSize) { LLVM_DEBUG( dbgs() << "Reshuffle failed: result vector too small to extract\n"); return SDValue(); } if (Src.MaxElt - Src.MinElt >= NumSrcElts) { LLVM_DEBUG( dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n"); return SDValue(); } if (Src.MinElt >= NumSrcElts) { // The extraction can just take the second half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(NumSrcElts, dl, MVT::i64)); Src.WindowBase = -NumSrcElts; } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(0, dl, MVT::i64)); } else { // An actual VEXT is needed SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(0, dl, MVT::i64)); SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(NumSrcElts, dl, MVT::i64)); unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); if (!SrcVT.is64BitVector()) { LLVM_DEBUG( dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT " "for SVE vectors."); return SDValue(); } Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, VEXTSrc2, DAG.getConstant(Imm, dl, MVT::i32)); Src.WindowBase = -Src.MinElt; } } // Another possible incompatibility occurs from the vector element types. We // can fix this by bitcasting the source vectors to the same type we intend // for the shuffle. for (auto &Src : Sources) { EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); if (SrcEltTy == SmallestEltTy) continue; assert(ShuffleVT.getVectorElementType() == SmallestEltTy); Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); Src.WindowScale = SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits(); Src.WindowBase *= Src.WindowScale; } // Final check before we try to actually produce a shuffle. LLVM_DEBUG(for (auto Src : Sources) assert(Src.ShuffleVec.getValueType() == ShuffleVT);); // The stars all align, our next step is to produce the mask for the shuffle. SmallVector Mask(ShuffleVT.getVectorNumElements(), -1); int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); if (Entry.isUndef()) continue; auto Src = find(Sources, Entry.getOperand(0)); int EltNo = cast(Entry.getOperand(1))->getSExtValue(); // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit // trunc. So only std::min(SrcBits, DestBits) actually get defined in this // segment. EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(), VT.getScalarSizeInBits()); int LanesDefined = BitsDefined / BitsPerShuffleLane; // This source is expected to fill ResMultiplier lanes of the final shuffle, // starting at the appropriate offset. int *LaneMask = &Mask[i * ResMultiplier]; int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; ExtractBase += NumElts * (Src - Sources.begin()); for (int j = 0; j < LanesDefined; ++j) LaneMask[j] = ExtractBase + j; } // Final check before we try to produce nonsense... 
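  // Worked example: for a v8i8 result with a v4i16 source (WindowScale == 2,
  // WindowBase == 0, ResMultiplier == 1), the operand extracting lane 3 of
  // that source defines min(16, 8) == 8 bits, i.e. one shuffle lane, and its
  // mask entry becomes 3 * 2 + 0 == 6 (plus 8 if it is the second source).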
if (!isShuffleMaskLegal(Mask, ShuffleVT)) { LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n"); return SDValue(); } SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; for (unsigned i = 0; i < Sources.size(); ++i) ShuffleOps[i] = Sources[i].ShuffleVec; SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], ShuffleOps[1], Mask); SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump(); dbgs() << "Reshuffle, creating node: "; V.dump();); return V; } // check if an EXT instruction can handle the shuffle mask when the // vector sources of the shuffle are the same. static bool isSingletonEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { unsigned NumElts = VT.getVectorNumElements(); // Assume that the first shuffle index is not UNDEF. Fail if it is. if (M[0] < 0) return false; Imm = M[0]; // If this is a VEXT shuffle, the immediate value is the index of the first // element. The other shuffle indices must be the successive elements after // the first one. unsigned ExpectedElt = Imm; for (unsigned i = 1; i < NumElts; ++i) { // Increment the expected index. If it wraps around, just follow it // back to index zero and keep going. ++ExpectedElt; if (ExpectedElt == NumElts) ExpectedElt = 0; if (M[i] < 0) continue; // ignore UNDEF indices if (ExpectedElt != static_cast(M[i])) return false; } return true; } // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from // v4i32s. This is really a truncate, which we can construct out of (legal) // concats and truncate nodes. static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) { if (V.getValueType() != MVT::v16i8) return SDValue(); assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR"); for (unsigned X = 0; X < 4; X++) { // Check the first item in each group is an extract from lane 0 of a v4i32 // or v4i16. SDValue BaseExt = V.getOperand(X * 4); if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || (BaseExt.getOperand(0).getValueType() != MVT::v4i16 && BaseExt.getOperand(0).getValueType() != MVT::v4i32) || !isa(BaseExt.getOperand(1)) || BaseExt.getConstantOperandVal(1) != 0) return SDValue(); SDValue Base = BaseExt.getOperand(0); // And check the other items are extracts from the same vector. for (unsigned Y = 1; Y < 4; Y++) { SDValue Ext = V.getOperand(X * 4 + Y); if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Ext.getOperand(0) != Base || !isa(Ext.getOperand(1)) || Ext.getConstantOperandVal(1) != Y) return SDValue(); } } // Turn the buildvector into a series of truncates and concates, which will // become uzip1's. Any v4i32s we found get truncated to v4i16, which are // concat together to produce 2 v8i16. These are both truncated and concat // together. 
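  // Assuming the usual lowering of these truncates and concats, the net effect
  // for four v4i32 inputs is roughly three UZP1 instructions:
  //   uzp1 v0.8h, v0.8h, v1.8h
  //   uzp1 v2.8h, v2.8h, v3.8h
  //   uzp1 v0.16b, v0.16b, v2.16b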
SDLoc DL(V); SDValue Trunc[4] = { V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0), V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)}; for (SDValue &V : Trunc) if (V.getValueType() == MVT::v4i32) V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V); SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]); SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]); SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0); SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1); return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1); } /// Check if a vector shuffle corresponds to a DUP instructions with a larger /// element width than the vector lane type. If that is the case the function /// returns true and writes the value of the DUP instruction lane operand into /// DupLaneOp static bool isWideDUPMask(ArrayRef M, EVT VT, unsigned BlockSize, unsigned &DupLaneOp) { assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && "Only possible block sizes for wide DUP are: 16, 32, 64"); if (BlockSize <= VT.getScalarSizeInBits()) return false; if (BlockSize % VT.getScalarSizeInBits() != 0) return false; if (VT.getSizeInBits() % BlockSize != 0) return false; size_t SingleVecNumElements = VT.getVectorNumElements(); size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits(); size_t NumBlocks = VT.getSizeInBits() / BlockSize; // We are looking for masks like // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element // might be replaced by 'undefined'. BlockIndices will eventually contain // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7] // for the above examples) SmallVector BlockElts(NumEltsPerBlock, -1); for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++) for (size_t I = 0; I < NumEltsPerBlock; I++) { int Elt = M[BlockIndex * NumEltsPerBlock + I]; if (Elt < 0) continue; // For now we don't support shuffles that use the second operand if ((unsigned)Elt >= SingleVecNumElements) return false; if (BlockElts[I] < 0) BlockElts[I] = Elt; else if (BlockElts[I] != Elt) return false; } // We found a candidate block (possibly with some undefs). It must be a // sequence of consecutive integers starting with a value divisible by // NumEltsPerBlock with some values possibly replaced by undef-s. // Find first non-undef element auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; }); assert(FirstRealEltIter != BlockElts.end() && "Shuffle with all-undefs must have been caught by previous cases, " "e.g. isSplat()"); if (FirstRealEltIter == BlockElts.end()) { DupLaneOp = 0; return true; } // Index of FirstRealElt in BlockElts size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin(); if ((unsigned)*FirstRealEltIter < FirstRealIndex) return false; // BlockElts[0] must have the following value if it isn't undef: size_t Elt0 = *FirstRealEltIter - FirstRealIndex; // Check the first element if (Elt0 % NumEltsPerBlock != 0) return false; // Check that the sequence indeed consists of consecutive integers (modulo // undefs) for (size_t I = 0; I < NumEltsPerBlock; I++) if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I) return false; DupLaneOp = Elt0 / NumEltsPerBlock; return true; } // check if an EXT instruction can handle the shuffle mask when the // vector sources of the shuffle are different. 
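// For example, with two v4i32 inputs the mask <3, 4, 5, 6> is a valid EXT:
// Imm comes out as 3, the caller scales it by getExtFactor (4 bytes here),
// and the shuffle becomes "ext v0.16b, v0.16b, v1.16b, #12". Leading undefs
// are tolerated, e.g. <-1, 4, 5, 6> is handled the same way.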
static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, unsigned &Imm) { // Look for the first non-undef element. const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; }); // Benefit form APInt to handle overflow when calculating expected element. unsigned NumElts = VT.getVectorNumElements(); unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); // The following shuffle indices must be the successive elements after the // first real element. bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) { return Elt != ExpectedElt++ && Elt != -1; }); if (FoundWrongElt) return false; // The index of an EXT is the first element if it is not UNDEF. // Watch out for the beginning UNDEFs. The EXT index should be the expected // value of the first element. E.g. // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. // ExpectedElt is the last mask index plus 1. Imm = ExpectedElt.getZExtValue(); // There are two difference cases requiring to reverse input vectors. // For example, for vector <4 x i32> we have the following cases, // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) // For both cases, we finally use mask <5, 6, 7, 0>, which requires // to reverse two input vectors. if (Imm < NumElts) ReverseEXT = true; else Imm -= NumElts; return true; } /// isREVMask - Check if a vector shuffle corresponds to a REV /// instruction with the specified blocksize. (The order of the elements /// within each block of the vector is reversed.) static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 || BlockSize == 128) && "Only possible block sizes for REV are: 16, 32, 64, 128"); unsigned EltSz = VT.getScalarSizeInBits(); unsigned NumElts = VT.getVectorNumElements(); unsigned BlockElts = M[0] + 1; // If the first shuffle index is UNDEF, be optimistic. if (M[0] < 0) BlockElts = BlockSize / EltSz; if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) return false; for (unsigned i = 0; i < NumElts; ++i) { if (M[i] < 0) continue; // ignore UNDEF indices if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) return false; } return true; } static bool isZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); if (NumElts % 2 != 0) return false; WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != Idx) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) return false; Idx += 1; } return true; } static bool isUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i != NumElts; ++i) { if (M[i] < 0) continue; // ignore UNDEF indices if ((unsigned)M[i] != 2 * i + WhichResult) return false; } return true; } static bool isTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); if (NumElts % 2 != 0) return false; WhichResult = (M[0] == 0 ? 
0 : 1); for (unsigned i = 0; i < NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) return false; } return true; } /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. static bool isZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); if (NumElts % 2 != 0) return false; WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != Idx) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) return false; Idx += 1; } return true; } /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, static bool isUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned Half = VT.getVectorNumElements() / 2; WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned j = 0; j != 2; ++j) { unsigned Idx = WhichResult; for (unsigned i = 0; i != Half; ++i) { int MIdx = M[i + j * Half]; if (MIdx >= 0 && (unsigned)MIdx != Idx) return false; Idx += 2; } } return true; } /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. static bool isTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); if (NumElts % 2 != 0) return false; WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i < NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) return false; } return true; } static bool isINSMask(ArrayRef M, int NumInputElements, bool &DstIsLeft, int &Anomaly) { if (M.size() != static_cast(NumInputElements)) return false; int NumLHSMatch = 0, NumRHSMatch = 0; int LastLHSMismatch = -1, LastRHSMismatch = -1; for (int i = 0; i < NumInputElements; ++i) { if (M[i] == -1) { ++NumLHSMatch; ++NumRHSMatch; continue; } if (M[i] == i) ++NumLHSMatch; else LastLHSMismatch = i; if (M[i] == i + NumInputElements) ++NumRHSMatch; else LastRHSMismatch = i; } if (NumLHSMatch == NumInputElements - 1) { DstIsLeft = true; Anomaly = LastLHSMismatch; return true; } else if (NumRHSMatch == NumInputElements - 1) { DstIsLeft = false; Anomaly = LastRHSMismatch; return true; } return false; } static bool isConcatMask(ArrayRef Mask, EVT VT, bool SplitLHS) { if (VT.getSizeInBits() != 128) return false; unsigned NumElts = VT.getVectorNumElements(); for (int I = 0, E = NumElts / 2; I != E; I++) { if (Mask[I] != I) return false; } int Offset = NumElts / 2; for (int I = NumElts / 2, E = NumElts; I != E; I++) { if (Mask[I] != I + SplitLHS * Offset) return false; } return true; } static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue V0 = Op.getOperand(0); SDValue V1 = Op.getOperand(1); ArrayRef Mask = cast(Op)->getMask(); if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || VT.getVectorElementType() != V1.getValueType().getVectorElementType()) return SDValue(); bool SplitV0 = V0.getValueSizeInBits() == 128; if (!isConcatMask(Mask, VT, SplitV0)) return SDValue(); EVT CastVT = 
VT.getHalfNumVectorElementsVT(*DAG.getContext()); if (SplitV0) { V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, DAG.getConstant(0, DL, MVT::i64)); } if (V1.getValueSizeInBits() == 128) { V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, DAG.getConstant(0, DL, MVT::i64)); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); } /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit /// the specified operations to build the shuffle. ID is the perfect-shuffle //ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle //table entry and LHS/RHS are the immediate inputs for this stage of the //shuffle. static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); enum { OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> OP_VREV, OP_VDUP0, OP_VDUP1, OP_VDUP2, OP_VDUP3, OP_VEXT1, OP_VEXT2, OP_VEXT3, OP_VUZPL, // VUZP, left result OP_VUZPR, // VUZP, right result OP_VZIPL, // VZIP, left result OP_VZIPR, // VZIP, right result OP_VTRNL, // VTRN, left result OP_VTRNR, // VTRN, right result OP_MOVLANE // Move lane. RHSID is the lane to move into }; if (OpNum == OP_COPY) { if (LHSID == (1 * 9 + 2) * 9 + 3) return LHS; assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); return RHS; } if (OpNum == OP_MOVLANE) { // Decompose a PerfectShuffle ID to get the Mask for lane Elt auto getPFIDLane = [](unsigned ID, int Elt) -> int { assert(Elt < 4 && "Expected Perfect Lanes to be less than 4"); Elt = 3 - Elt; while (Elt > 0) { ID /= 9; Elt--; } return (ID % 9 == 8) ? -1 : ID % 9; }; // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We // get the lane to move from from the PFID, which is always from the // original vectors (V1 or V2). SDValue OpLHS = GeneratePerfectShuffle( LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); EVT VT = OpLHS.getValueType(); assert(RHSID < 8 && "Expected a lane index for RHSID!"); unsigned ExtLane = 0; SDValue Input; // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs // convert into a higher type. if (RHSID & 0x4) { int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1; if (MaskElt == -1) MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1; assert(MaskElt >= 0 && "Didn't expect an undef movlane index!"); ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2); Input = MaskElt < 2 ? V1 : V2; if (VT.getScalarSizeInBits() == 16) { Input = DAG.getBitcast(MVT::v2f32, Input); OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS); } else { assert(VT.getScalarSizeInBits() == 32 && "Expected 16 or 32 bit shuffle elemements"); Input = DAG.getBitcast(MVT::v2f64, Input); OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS); } } else { int MaskElt = getPFIDLane(ID, RHSID); assert(MaskElt >= 0 && "Didn't expect an undef movlane index!"); ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4); Input = MaskElt < 4 ? V1 : V2; // Be careful about creating illegal types. Use f16 instead of i16. 
if (VT == MVT::v4i16) { Input = DAG.getBitcast(MVT::v4f16, Input); OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS); } } SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Input.getValueType().getVectorElementType(), Input, DAG.getVectorIdxConstant(ExtLane, dl)); SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS, Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl)); return DAG.getBitcast(VT, Ins); } SDValue OpLHS, OpRHS; OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); EVT VT = OpLHS.getValueType(); switch (OpNum) { default: llvm_unreachable("Unknown shuffle opcode!"); case OP_VREV: // VREV divides the vector in half and swaps within the half. if (VT.getVectorElementType() == MVT::i32 || VT.getVectorElementType() == MVT::f32) return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); // vrev <4 x i16> -> REV32 if (VT.getVectorElementType() == MVT::i16 || VT.getVectorElementType() == MVT::f16 || VT.getVectorElementType() == MVT::bf16) return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); // vrev <4 x i8> -> REV16 assert(VT.getVectorElementType() == MVT::i8); return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); case OP_VDUP0: case OP_VDUP1: case OP_VDUP2: case OP_VDUP3: { EVT EltTy = VT.getVectorElementType(); unsigned Opcode; if (EltTy == MVT::i8) Opcode = AArch64ISD::DUPLANE8; else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16) Opcode = AArch64ISD::DUPLANE16; else if (EltTy == MVT::i32 || EltTy == MVT::f32) Opcode = AArch64ISD::DUPLANE32; else if (EltTy == MVT::i64 || EltTy == MVT::f64) Opcode = AArch64ISD::DUPLANE64; else llvm_unreachable("Invalid vector element type?"); if (VT.getSizeInBits() == 64) OpLHS = WidenVector(OpLHS, DAG); SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64); return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); } case OP_VEXT1: case OP_VEXT2: case OP_VEXT3: { unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, DAG.getConstant(Imm, dl, MVT::i32)); } case OP_VUZPL: return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS); case OP_VUZPR: return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS); case OP_VZIPL: return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS); case OP_VZIPR: return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS); case OP_VTRNL: return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS); case OP_VTRNR: return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS); } } static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, SelectionDAG &DAG) { // Check to see if we can use the TBL instruction. SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); SDLoc DL(Op); EVT EltVT = Op.getValueType().getVectorElementType(); unsigned BytesPerElt = EltVT.getSizeInBits() / 8; bool Swap = false; if (V1.isUndef() || isZerosVector(V1.getNode())) { std::swap(V1, V2); Swap = true; } // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill // out of range values with 0s. We do need to make sure that any out-of-range // values are really out-of-range for a v16i8 vector. 
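  // Worked example: for a v8i16 shuffle (BytesPerElt == 2), mask element 9
  // (lane 1 of the second source) expands to byte indices 18 and 19; when V2
  // is undef or zero those out-of-range indices are replaced with 255 below,
  // so TBL writes zeros for them.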
bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode()); MVT IndexVT = MVT::v8i8; unsigned IndexLen = 8; if (Op.getValueSizeInBits() == 128) { IndexVT = MVT::v16i8; IndexLen = 16; } SmallVector TBLMask; for (int Val : ShuffleMask) { for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { unsigned Offset = Byte + Val * BytesPerElt; if (Swap) Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen; if (IsUndefOrZero && Offset >= IndexLen) Offset = 255; TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); } } SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); SDValue Shuffle; if (IsUndefOrZero) { if (IndexLen == 8) V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); } else { if (IndexLen == 8) { V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); } else { // FIXME: We cannot, for the moment, emit a TBL2 instruction because we // cannot currently represent the register constraints on the input // table registers. // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], // IndexLen)); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, V2Cst, DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); } } return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); } static unsigned getDUPLANEOp(EVT EltType) { if (EltType == MVT::i8) return AArch64ISD::DUPLANE8; if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16) return AArch64ISD::DUPLANE16; if (EltType == MVT::i32 || EltType == MVT::f32) return AArch64ISD::DUPLANE32; if (EltType == MVT::i64 || EltType == MVT::f64) return AArch64ISD::DUPLANE64; llvm_unreachable("Invalid vector element type?"); } static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, unsigned Opcode, SelectionDAG &DAG) { // Try to eliminate a bitcasted extract subvector before a DUPLANE. auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) { // Match: dup (bitcast (extract_subv X, C)), LaneC if (BitCast.getOpcode() != ISD::BITCAST || BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR) return false; // The extract index must align in the destination type. That may not // happen if the bitcast is from narrow to wide type. SDValue Extract = BitCast.getOperand(0); unsigned ExtIdx = Extract.getConstantOperandVal(1); unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits(); unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth; unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits(); if (ExtIdxInBits % CastedEltBitWidth != 0) return false; // Can't handle cases where vector size is not 128-bit if (!Extract.getOperand(0).getValueType().is128BitVector()) return false; // Update the lane value by offsetting with the scaled extract index. LaneC += ExtIdxInBits / CastedEltBitWidth; // Determine the casted vector type of the wide vector input. 
// dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC' // Examples: // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5 unsigned SrcVecNumElts = Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth; CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(), SrcVecNumElts); return true; }; MVT CastVT; if (getScaledOffsetDup(V, Lane, CastVT)) { V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0)); } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.getOperand(0).getValueType().is128BitVector()) { // The lane is incremented by the index of the extract. // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3 Lane += V.getConstantOperandVal(1); V = V.getOperand(0); } else if (V.getOpcode() == ISD::CONCAT_VECTORS) { // The lane is decremented if we are splatting from the 2nd operand. // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; Lane -= Idx * VT.getVectorNumElements() / 2; V = WidenVector(V.getOperand(Idx), DAG); } else if (VT.getSizeInBits() == 64) { // Widen the operand to 128-bit register with undef. V = WidenVector(V, DAG); } return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64)); } // Return true if we can get a new shuffle mask by checking the parameter mask // array to test whether every two adjacent mask values are continuous and // starting from an even number. static bool isWideTypeMask(ArrayRef M, EVT VT, SmallVectorImpl &NewMask) { unsigned NumElts = VT.getVectorNumElements(); if (NumElts % 2 != 0) return false; NewMask.clear(); for (unsigned i = 0; i < NumElts; i += 2) { int M0 = M[i]; int M1 = M[i + 1]; // If both elements are undef, new mask is undef too. if (M0 == -1 && M1 == -1) { NewMask.push_back(-1); continue; } if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) { NewMask.push_back(M1 / 2); continue; } if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) { NewMask.push_back(M0 / 2); continue; } NewMask.clear(); return false; } assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!"); return true; } // Try to widen element type to get a new mask value for a better permutation // sequence, so that we can use NEON shuffle instructions, such as zip1/2, // UZP1/2, TRN1/2, REV, INS, etc. // For example: // shufflevector <4 x i32> %a, <4 x i32> %b, // <4 x i32> // is equivalent to: // shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> // Finally, we can get: // mov v0.d[0], v1.d[1] static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); EVT VT = Op.getValueType(); EVT ScalarVT = VT.getVectorElementType(); unsigned ElementSize = ScalarVT.getFixedSizeInBits(); SDValue V0 = Op.getOperand(0); SDValue V1 = Op.getOperand(1); ArrayRef Mask = cast(Op)->getMask(); // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ... // We need to make sure the wider element type is legal. Thus, ElementSize // should be not larger than 32 bits, and i1 type should also be excluded. if (ElementSize > 32 || ElementSize == 1) return SDValue(); SmallVector NewMask; if (isWideTypeMask(Mask, VT, NewMask)) { MVT NewEltVT = VT.isFloatingPoint() ? 
MVT::getFloatingPointVT(ElementSize * 2) : MVT::getIntegerVT(ElementSize * 2); MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { V0 = DAG.getBitcast(NewVT, V0); V1 = DAG.getBitcast(NewVT, V1); return DAG.getBitcast(VT, DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask)); } } return SDValue(); } // Try to fold shuffle (tbl2, tbl2) into a single tbl4. static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, ArrayRef ShuffleMask, SelectionDAG &DAG) { SDValue Tbl1 = Op->getOperand(0); SDValue Tbl2 = Op->getOperand(1); SDLoc dl(Op); SDValue Tbl2ID = DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64); EVT VT = Op.getValueType(); if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN || Tbl1->getOperand(0) != Tbl2ID || Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN || Tbl2->getOperand(0) != Tbl2ID) return SDValue(); if (Tbl1->getValueType(0) != MVT::v16i8 || Tbl2->getValueType(0) != MVT::v16i8) return SDValue(); SDValue Mask1 = Tbl1->getOperand(3); SDValue Mask2 = Tbl2->getOperand(3); SmallVector TBLMaskParts(16, SDValue()); for (unsigned I = 0; I < 16; I++) { if (ShuffleMask[I] < 16) TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]); else { auto *C = dyn_cast(Mask2->getOperand(ShuffleMask[I] - 16)); if (!C) return SDValue(); TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32); } } SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts); SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8, {ID, Tbl1->getOperand(1), Tbl1->getOperand(2), Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask}); } // Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros, // but we don't have an appropriate instruction, // so custom-lower it as ZIP1-with-zeros. SDValue AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); EVT VT = Op.getValueType(); SDValue SrcOp = Op.getOperand(0); EVT SrcVT = SrcOp.getValueType(); assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 && "Unexpected extension factor."); unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits(); // FIXME: support multi-step zipping? if (Scale != 2) return SDValue(); SDValue Zeros = DAG.getConstant(0, dl, SrcVT); return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros)); } SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); EVT VT = Op.getValueType(); ShuffleVectorSDNode *SVN = cast(Op.getNode()); if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG); // Convert shuffles that are directly supported on NEON to target-specific // DAG nodes, instead of keeping them as shuffles and matching them again // during code selection. This is more efficient and avoids the possibility // of inconsistencies between legalization and selection. ArrayRef ShuffleMask = SVN->getMask(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!"); assert(ShuffleMask.size() == VT.getVectorNumElements() && "Unexpected VECTOR_SHUFFLE mask size!"); if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG)) return Res; if (SVN->isSplat()) { int Lane = SVN->getSplatIndex(); // If this is undef splat, generate it via "just" vdup, if possible. 
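  // Splat lowering in a nutshell: a splat of a freshly inserted scalar becomes
  // AArch64ISD::DUP of that scalar (e.g. "dup v0.4s, w0"), while a splat of an
  // existing vector lane becomes a DUPLANE node (e.g. "dup v0.4s, v1.s[3]").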
if (Lane == -1) Lane = 0; if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(), V1.getOperand(0)); // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- // constant. If so, we can just reference the lane's definition directly. if (V1.getOpcode() == ISD::BUILD_VECTOR && !isa(V1.getOperand(Lane))) return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane)); // Otherwise, duplicate from the lane of the input vector. unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); return constructDup(V1, Lane, dl, VT, Opcode, DAG); } // Check if the mask matches a DUP for a wider element for (unsigned LaneSize : {64U, 32U, 16U}) { unsigned Lane = 0; if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) { unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64 : LaneSize == 32 ? AArch64ISD::DUPLANE32 : AArch64ISD::DUPLANE16; // Cast V1 to an integer vector with required lane size MVT NewEltTy = MVT::getIntegerVT(LaneSize); unsigned NewEltCount = VT.getSizeInBits() / LaneSize; MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount); V1 = DAG.getBitcast(NewVecTy, V1); // Constuct the DUP instruction V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG); // Cast back to the original type return DAG.getBitcast(VT, V1); } } if (isREVMask(ShuffleMask, VT, 64)) return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2); if (isREVMask(ShuffleMask, VT, 32)) return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2); if (isREVMask(ShuffleMask, VT, 16)) return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2); if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) || (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) && ShuffleVectorInst::isReverseMask(ShuffleMask)) { SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1); return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev, DAG.getConstant(8, dl, MVT::i32)); } bool ReverseEXT = false; unsigned Imm; if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { if (ReverseEXT) std::swap(V1, V2); Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, DAG.getConstant(Imm, dl, MVT::i32)); } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) { Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, DAG.getConstant(Imm, dl, MVT::i32)); } unsigned WhichResult; if (isZIPMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); } if (isUZPMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); } if (isTRNMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); } if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? 
AArch64ISD::TRN1 : AArch64ISD::TRN2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) return Concat; bool DstIsLeft; int Anomaly; int NumInputElements = V1.getValueType().getVectorNumElements(); if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { SDValue DstVec = DstIsLeft ? V1 : V2; SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64); SDValue SrcVec = V1; int SrcLane = ShuffleMask[Anomaly]; if (SrcLane >= NumInputElements) { SrcVec = V2; SrcLane -= VT.getVectorNumElements(); } SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64); EVT ScalarVT = VT.getVectorElementType(); if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger()) ScalarVT = MVT::i32; return DAG.getNode( ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), DstLaneV); } if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG)) return NewSD; // If the shuffle is not directly supported and it has 4 elements, use // the PerfectShuffle-generated table to synthesize it from other shuffles. unsigned NumElts = VT.getVectorNumElements(); if (NumElts == 4) { unsigned PFIndexes[4]; for (unsigned i = 0; i != 4; ++i) { if (ShuffleMask[i] < 0) PFIndexes[i] = 8; else PFIndexes[i] = ShuffleMask[i]; } // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + PFIndexes[2] * 9 + PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG, dl); } return GenerateTBL(Op, ShuffleMask, DAG); } SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) return LowerToScalableOp(Op, DAG); assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 && "Unexpected vector type!"); // We can handle the constant cases during isel. if (isa(Op.getOperand(0))) return Op; // There isn't a natural way to handle the general i1 case, so we use some // trickery with whilelo. SDLoc DL(Op); SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64); SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal, DAG.getValueType(MVT::i1)); SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64); SDValue Zero = DAG.getConstant(0, DL, MVT::i64); if (VT == MVT::nxv1i1) return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1, DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID, Zero, SplatVal), Zero); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal); } SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); if (!isTypeLegal(VT) || !VT.isScalableVector()) return SDValue(); // Current lowering only supports the SVE-ACLE types. if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock) return SDValue(); // The DUPQ operation is indepedent of element type so normalise to i64s. SDValue Idx128 = Op.getOperand(2); // DUPQ can be used when idx is in range. 
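  // For a constant index <= 3 this takes the DUPLANE128 path below. Otherwise
  // the TBL fallback builds the index vector <2i, 2i+1, 2i, 2i+1, ...> for a
  // runtime index i, which replicates 128-bit block i across the register.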
auto *CIdx = dyn_cast(Idx128); if (CIdx && (CIdx->getZExtValue() <= 3)) { SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64); return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI); } SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1)); // The ACLE says this must produce the same result as: // svtbl(data, svadd_x(svptrue_b64(), // svand_x(svptrue_b64(), svindex_u64(0, 1), 1), // index * 2)) SDValue One = DAG.getConstant(1, DL, MVT::i64); SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One); // create the vector 0,1,0,1,... SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64); SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne); // create the vector idx64,idx64+1,idx64,idx64+1,... SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128); SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64); SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64); // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],... SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask); return DAG.getNode(ISD::BITCAST, DL, VT, TBL); } static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits) { EVT VT = BVN->getValueType(0); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; for (unsigned i = 0; i < NumSplats; ++i) { CnstBits <<= SplatBitSize; UndefBits <<= SplatBitSize; CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); } return true; } return false; } // Try 64-bit splatted SIMD immediate. static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits) { if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); EVT VT = Op.getValueType(); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64; if (AArch64_AM::isAdvSIMDModImmType10(Value)) { Value = AArch64_AM::encodeAdvSIMDModImmType10(Value); SDLoc dl(Op); SDValue Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Try 32-bit splatted SIMD immediate. static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS = nullptr) { EVT VT = Op.getValueType(); if (VT.isFixedLengthVector() && !DAG.getSubtarget().isNeonAvailable()) return SDValue(); if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; bool isAdvSIMDModImm = false; uint64_t Shift; if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType1(Value); Shift = 0; } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType2(Value); Shift = 8; } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType3(Value); Shift = 16; } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType4(Value); Shift = 24; } if (isAdvSIMDModImm) { SDLoc dl(Op); SDValue Mov; if (LHS) Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS), DAG.getConstant(Value, dl, MVT::i32), DAG.getConstant(Shift, dl, MVT::i32)); else Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32), DAG.getConstant(Shift, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Try 16-bit splatted SIMD immediate. static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS = nullptr) { EVT VT = Op.getValueType(); if (VT.isFixedLengthVector() && !DAG.getSubtarget().isNeonAvailable()) return SDValue(); if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; bool isAdvSIMDModImm = false; uint64_t Shift; if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType5(Value); Shift = 0; } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType6(Value); Shift = 8; } if (isAdvSIMDModImm) { SDLoc dl(Op); SDValue Mov; if (LHS) Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS), DAG.getConstant(Value, dl, MVT::i32), DAG.getConstant(Shift, dl, MVT::i32)); else Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32), DAG.getConstant(Shift, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Try 32-bit splatted SIMD immediate with shifted ones. static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits) { if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); EVT VT = Op.getValueType(); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; bool isAdvSIMDModImm = false; uint64_t Shift; if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType7(Value); Shift = 264; } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType8(Value); Shift = 272; } if (isAdvSIMDModImm) { SDLoc dl(Op); SDValue Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32), DAG.getConstant(Shift, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Try 8-bit splatted SIMD immediate. static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits) { if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); EVT VT = Op.getValueType(); MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v16i8 : MVT::v8i8; if (AArch64_AM::isAdvSIMDModImmType9(Value)) { Value = AArch64_AM::encodeAdvSIMDModImmType9(Value); SDLoc dl(Op); SDValue Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Try FP splatted SIMD immediate. static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits) { if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); EVT VT = Op.getValueType(); bool isWide = (VT.getSizeInBits() == 128); MVT MovTy; bool isAdvSIMDModImm = false; if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType11(Value); MovTy = isWide ? MVT::v4f32 : MVT::v2f32; } else if (isWide && (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType12(Value); MovTy = MVT::v2f64; } if (isAdvSIMDModImm) { SDLoc dl(Op); SDValue Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Specialized code to quickly find if PotentialBVec is a BuildVector that // consists of only the same constant int value, returned in reference arg // ConstVal static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal) { BuildVectorSDNode *Bvec = dyn_cast(PotentialBVec); if (!Bvec) return false; ConstantSDNode *FirstElt = dyn_cast(Bvec->getOperand(0)); if (!FirstElt) return false; EVT VT = Bvec->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); for (unsigned i = 1; i < NumElts; ++i) if (dyn_cast(Bvec->getOperand(i)) != FirstElt) return false; ConstVal = FirstElt->getZExtValue(); return true; } // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a // BUILD_VECTORs with constant element C1, C2 is a constant, and: // - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2) // - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2) // The (or (lsl Y, C2), (and X, BvecC1)) case is also handled. static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); if (!VT.isVector()) return SDValue(); SDLoc DL(N); SDValue And; SDValue Shift; SDValue FirstOp = N->getOperand(0); unsigned FirstOpc = FirstOp.getOpcode(); SDValue SecondOp = N->getOperand(1); unsigned SecondOpc = SecondOp.getOpcode(); // Is one of the operands an AND or a BICi? The AND may have been optimised to // a BICi in order to use an immediate instead of a register. // Is the other operand an shl or lshr? This will have been turned into: // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift. if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) && (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) { And = FirstOp; Shift = SecondOp; } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) && (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) { And = SecondOp; Shift = FirstOp; } else return SDValue(); bool IsAnd = And.getOpcode() == ISD::AND; bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR; // Is the shift amount constant? ConstantSDNode *C2node = dyn_cast(Shift.getOperand(1)); if (!C2node) return SDValue(); uint64_t C1; if (IsAnd) { // Is the and mask vector all constant? 
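A minimal standalone sketch (plain C++, not part of this file) of the identity the tryLowerToSLI pattern above relies on, modelled on a single 8-bit lane: when the AND mask C1 keeps exactly the bits that the shifted operand cannot reach, the OR of the two halves is a shift-and-insert. The helper names sli8/sri8 are illustrative only.

#include <cassert>
#include <cstdint>

// Reference model built bit by bit: SLI keeps the low C bits of X and writes
// (Y << C) into the rest; SRI keeps the high C bits of X and writes (Y >> C).
static uint8_t sli8(uint8_t X, uint8_t Y, unsigned C) {
  uint8_t R = 0;
  for (unsigned i = 0; i < 8; ++i)
    R |= ((i < C) ? ((X >> i) & 1) : ((Y >> (i - C)) & 1)) << i;
  return R;
}
static uint8_t sri8(uint8_t X, uint8_t Y, unsigned C) {
  uint8_t R = 0;
  for (unsigned i = 0; i < 8; ++i)
    R |= ((i >= 8 - C) ? ((X >> i) & 1) : ((Y >> (i + C)) & 1)) << i;
  return R;
}

int main() {
  for (unsigned C = 1; C < 8; ++C) {
    uint8_t C1sli = (uint8_t)~(0xFFu << C); // low C bits set
    uint8_t C1sri = (uint8_t)~(0xFFu >> C); // high C bits set
    for (unsigned X = 0; X < 256; ++X)
      for (unsigned Y = 0; Y < 256; ++Y) {
        // (or (and X, C1), (shl Y, C)) behaves exactly like SLI X, Y, #C ...
        assert(((X & C1sli) | (uint8_t)(Y << C)) == sli8(X, Y, C));
        // ... and (or (and X, C1'), (srl Y, C)) like SRI X, Y, #C.
        assert(((X & C1sri) | (uint8_t)(Y >> C)) == sri8(X, Y, C));
      }
  }
  return 0;
}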
if (!isAllConstantBuildVector(And.getOperand(1), C1)) return SDValue(); } else { // Reconstruct the corresponding AND immediate from the two BICi immediates. ConstantSDNode *C1nodeImm = dyn_cast(And.getOperand(1)); ConstantSDNode *C1nodeShift = dyn_cast(And.getOperand(2)); assert(C1nodeImm && C1nodeShift); C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue()); } // Is C1 == ~(Ones(ElemSizeInBits) << C2) or // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account // how much one can shift elements of a particular size? uint64_t C2 = C2node->getZExtValue(); unsigned ElemSizeInBits = VT.getScalarSizeInBits(); if (C2 > ElemSizeInBits) return SDValue(); APInt C1AsAPInt(ElemSizeInBits, C1); APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2) : APInt::getLowBitsSet(ElemSizeInBits, C2); if (C1AsAPInt != RequiredC1) return SDValue(); SDValue X = And.getOperand(0); SDValue Y = Shift.getOperand(0); unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1)); LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n"); LLVM_DEBUG(N->dump(&DAG)); LLVM_DEBUG(dbgs() << "into: \n"); LLVM_DEBUG(ResultSLI->dump(&DAG)); ++NumShiftInserts; return ResultSLI; } SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { if (useSVEForFixedLengthVectorVT(Op.getValueType(), !Subtarget->isNeonAvailable())) return LowerToScalableOp(Op, DAG); // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) return Res; EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); BuildVectorSDNode *BVN = dyn_cast(Op.getOperand(1).getNode()); if (!BVN) { // OR commutes, so try swapping the operands. LHS = Op.getOperand(1); BVN = dyn_cast(Op.getOperand(0).getNode()); } if (!BVN) return Op; APInt DefBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); if (resolveBuildVector(BVN, DefBits, UndefBits)) { SDValue NewOp; if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, DefBits, &LHS)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, DefBits, &LHS))) return NewOp; if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, UndefBits, &LHS)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, UndefBits, &LHS))) return NewOp; } // We can always fall back to a non-immediate OR. return Op; } // Normalize the operands of BUILD_VECTOR. The value of constant operands will // be truncated to fit element width. static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); SDLoc dl(Op); EVT VT = Op.getValueType(); EVT EltTy= VT.getVectorElementType(); if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) return Op; SmallVector Ops; for (SDValue Lane : Op->ops()) { // For integer vectors, type legalization would have promoted the // operands already. Otherwise, if Op is a floating-point splat // (with operands cast to integers), then the only possibilities // are constants and UNDEFs. 
if (auto *CstLane = dyn_cast(Lane)) { APInt LowBits(EltTy.getSizeInBits(), CstLane->getZExtValue()); Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); } else if (Lane.getNode()->isUndef()) { Lane = DAG.getUNDEF(MVT::i32); } else { assert(Lane.getValueType() == MVT::i32 && "Unexpected BUILD_VECTOR operand type"); } Ops.push_back(Lane); } return DAG.getBuildVector(VT, dl, Ops); } static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); APInt DefBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); BuildVectorSDNode *BVN = cast(Op.getNode()); if (resolveBuildVector(BVN, DefBits, UndefBits)) { SDValue NewOp; if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) return NewOp; DefBits = ~DefBits; if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) return NewOp; DefBits = UndefBits; if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) return NewOp; DefBits = ~UndefBits; if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) return NewOp; } return SDValue(); } SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) { if (auto SeqInfo = cast(Op)->isConstantSequence()) { SDLoc DL(Op); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT); SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second); SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps); return convertFromScalableVector(DAG, Op.getValueType(), Seq); } // Revert to common legalisation for all other variants. return SDValue(); } // Try to build a simple constant vector. Op = NormalizeBuildVector(Op, DAG); // Thought this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so, // abort. if (Op.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); // Certain vector constants, used to express things like logical NOT and // arithmetic NEG, are passed through unmodified. This allows special // patterns for these operations to match, which will lower these constants // to whatever is proven necessary. 
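The ConstantBuildVector helper above first tries to encode the splatted bits directly and, failing that, retries with the bitwise complement so the inverted MVNI forms can be used. A standalone sketch of that retry, restricted to the 32-bit byte-at-a-shift forms (roughly AdvSIMD modified-immediate types 1-4); encode32 and Encoded are illustrative names, not LLVM API.

#include <cassert>
#include <cstdint>
#include <optional>

struct Encoded {
  bool UseMVNI;   // true: the complement was encodable, emit MVNI
  uint8_t Imm8;   // 8-bit payload
  unsigned Shift; // 0, 8, 16 or 24
};

// Does V consist of a single byte at one of the four byte positions?
static std::optional<Encoded> tryDirect(uint32_t V) {
  for (unsigned Shift = 0; Shift < 32; Shift += 8)
    if ((V & ~(0xFFu << Shift)) == 0)
      return Encoded{false, (uint8_t)(V >> Shift), Shift};
  return std::nullopt;
}

// Mirror of the DefBits / ~DefBits retry: try MOVI first, then MVNI.
static std::optional<Encoded> encode32(uint32_t V) {
  if (auto E = tryDirect(V))
    return E;
  if (auto E = tryDirect(~V)) {
    E->UseMVNI = true;
    return E;
  }
  return std::nullopt; // needs another form (MSL, per-byte MOVI, FMOV, ...)
}

int main() {
  auto A = encode32(0x00ab0000); // MOVI #0xab, LSL #16
  assert(A && !A->UseMVNI && A->Imm8 == 0xab && A->Shift == 16);

  auto B = encode32(0xffff12ff); // ~V == 0x0000ed00 -> MVNI #0xed, LSL #8
  assert(B && B->UseMVNI && B->Imm8 == 0xed && B->Shift == 8);

  assert(!encode32(0x12345678)); // not representable in these forms
  return 0;
}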
BuildVectorSDNode *BVN = cast(Op.getNode()); if (BVN->isConstant()) { if (ConstantSDNode *Const = BVN->getConstantSplatNode()) { unsigned BitSize = VT.getVectorElementType().getSizeInBits(); APInt Val(BitSize, Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue()); if (Val.isZero() || (VT.isInteger() && Val.isAllOnes())) return Op; } if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode()) if (Const->isZero() && !Const->isNegative()) return Op; } if (SDValue V = ConstantBuildVector(Op, DAG)) return V; // Scan through the operands to find some interesting properties we can // exploit: // 1) If only one value is used, we can use a DUP, or // 2) if only the low element is not undef, we can just insert that, or // 3) if only one constant value is used (w/ some non-constant lanes), // we can splat the constant value into the whole vector then fill // in the non-constant lanes. // 4) FIXME: If different constant values are used, but we can intelligently // select the values we'll be overwriting for the non-constant // lanes such that we can directly materialize the vector // some other way (MOVI, e.g.), we can be sneaky. // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP. SDLoc dl(Op); unsigned NumElts = VT.getVectorNumElements(); bool isOnlyLowElement = true; bool usesOnlyOneValue = true; bool usesOnlyOneConstantValue = true; bool isConstant = true; bool AllLanesExtractElt = true; unsigned NumConstantLanes = 0; unsigned NumDifferentLanes = 0; unsigned NumUndefLanes = 0; SDValue Value; SDValue ConstantValue; SmallMapVector DifferentValueMap; unsigned ConsecutiveValCount = 0; SDValue PrevVal; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) AllLanesExtractElt = false; if (V.isUndef()) { ++NumUndefLanes; continue; } if (i > 0) isOnlyLowElement = false; if (!isIntOrFPConstant(V)) isConstant = false; if (isIntOrFPConstant(V)) { ++NumConstantLanes; if (!ConstantValue.getNode()) ConstantValue = V; else if (ConstantValue != V) usesOnlyOneConstantValue = false; } if (!Value.getNode()) Value = V; else if (V != Value) { usesOnlyOneValue = false; ++NumDifferentLanes; } if (PrevVal != V) { ConsecutiveValCount = 0; PrevVal = V; } // Keep different values and its last consecutive count. For example, // // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23, // t24, t24, t24, t24, t24, t24, t24, t24 // t23 = consecutive count 8 // t24 = consecutive count 8 // ------------------------------------------------------------------ // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24, // t24, t24, t24, t24, t24, t24, t24, t24 // t23 = consecutive count 5 // t24 = consecutive count 9 DifferentValueMap[V] = ++ConsecutiveValCount; } if (!Value.getNode()) { LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n"); return DAG.getUNDEF(VT); } // Convert BUILD_VECTOR where all elements but the lowest are undef into // SCALAR_TO_VECTOR, except for when we have a single-element constant vector // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR. if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) { LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 " "SCALAR_TO_VECTOR node\n"); return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); } if (AllLanesExtractElt) { SDNode *Vector = nullptr; bool Even = false; bool Odd = false; // Check whether the extract elements match the Even pattern <0,2,4,...> or // the Odd pattern <1,3,5,...>. 
for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); const SDNode *N = V.getNode(); if (!isa(N->getOperand(1))) { Even = false; Odd = false; break; } SDValue N0 = N->getOperand(0); // All elements are extracted from the same vector. if (!Vector) { Vector = N0.getNode(); // Check that the type of EXTRACT_VECTOR_ELT matches the type of // BUILD_VECTOR. if (VT.getVectorElementType() != N0.getValueType().getVectorElementType()) break; } else if (Vector != N0.getNode()) { Odd = false; Even = false; break; } // Extracted values are either at Even indices <0,2,4,...> or at Odd // indices <1,3,5,...>. uint64_t Val = N->getConstantOperandVal(1); if (Val == 2 * i) { Even = true; continue; } if (Val - 1 == 2 * i) { Odd = true; continue; } // Something does not match: abort. Odd = false; Even = false; break; } if (Even || Odd) { SDValue LHS = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), DAG.getConstant(0, dl, MVT::i64)); SDValue RHS = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), DAG.getConstant(NumElts, dl, MVT::i64)); if (Even && !Odd) return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS, RHS); if (Odd && !Even) return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS, RHS); } } // Use DUP for non-constant splats. For f32 constant splats, reduce to // i32 and try again. if (usesOnlyOneValue) { if (!isConstant) { if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Value.getValueType() != VT) { LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n"); return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); } // This is actually a DUPLANExx operation, which keeps everything vectory. SDValue Lane = Value.getOperand(1); Value = Value.getOperand(0); if (Value.getValueSizeInBits() == 64) { LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, " "widening it\n"); Value = WidenVector(Value, DAG); } unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); return DAG.getNode(Opcode, dl, VT, Value, Lane); } if (VT.getVectorElementType().isFloatingPoint()) { SmallVector Ops; EVT EltTy = VT.getVectorElementType(); assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 || EltTy == MVT::f64) && "Unsupported floating-point vector type"); LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int " "BITCASTS, and try again\n"); MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: "; Val.dump();); Val = LowerBUILD_VECTOR(Val, DAG); if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); } } // If we need to insert a small number of different non-constant elements and // the vector width is sufficiently large, prefer using DUP with the common // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred, // skip the constant lane handling below. bool PreferDUPAndInsert = !isConstant && NumDifferentLanes >= 1 && NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) && NumDifferentLanes >= NumConstantLanes; // If there was only one constant value used and for more than one lane, // start by splatting that value, then replace the non-constant lanes. This // is better than the default, which will perform a separate initialization // for each lane. 
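The AllLanesExtractElt path above turns a BUILD_VECTOR of even-indexed (or odd-indexed) extracts from one wider vector into UZP1 (or UZP2) of its two halves. A standalone model with eight 16-bit lanes, assuming the usual definition of UZP1/UZP2 as taking the even/odd lanes of the operand concatenation:

#include <array>
#include <cassert>
#include <cstdint>

constexpr unsigned N = 4; // result lanes; the source vector has 2*N lanes

using Vec = std::array<uint16_t, N>;
using WideVec = std::array<uint16_t, 2 * N>;

// UZP1/UZP2: the even/odd lanes of the concatenation A:B.
static Vec uzp1(const Vec &A, const Vec &B) {
  Vec R{};
  for (unsigned i = 0; i < N; ++i)
    R[i] = (2 * i < N) ? A[2 * i] : B[2 * i - N];
  return R;
}
static Vec uzp2(const Vec &A, const Vec &B) {
  Vec R{};
  for (unsigned i = 0; i < N; ++i)
    R[i] = (2 * i + 1 < N) ? A[2 * i + 1] : B[2 * i + 1 - N];
  return R;
}

int main() {
  WideVec Src = {10, 11, 12, 13, 14, 15, 16, 17};
  Vec Lo, Hi; // the two EXTRACT_SUBVECTORs of Src
  for (unsigned i = 0; i < N; ++i) {
    Lo[i] = Src[i];
    Hi[i] = Src[N + i];
  }

  Vec Even, Odd;
  for (unsigned i = 0; i < N; ++i) {
    Even[i] = Src[2 * i];     // lanes <0,2,4,...>
    Odd[i] = Src[2 * i + 1];  // lanes <1,3,5,...>
  }
  assert(Even == uzp1(Lo, Hi));
  assert(Odd == uzp2(Lo, Hi));
  return 0;
}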
if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) { // Firstly, try to materialize the splat constant. SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue); unsigned BitSize = VT.getScalarSizeInBits(); APInt ConstantValueAPInt(1, 0); if (auto *C = dyn_cast(ConstantValue)) ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize); if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) && !ConstantValueAPInt.isAllOnes()) { Val = ConstantBuildVector(Val, DAG); if (!Val) // Otherwise, materialize the constant and splat it. Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); } // Now insert the non-constant lanes. for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); if (!isIntOrFPConstant(V)) // Note that type legalization likely mucked about with the VT of the // source operand, so we may have to convert it here before inserting. Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); } return Val; } // This will generate a load from the constant pool. if (isConstant) { LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default " "expansion\n"); return SDValue(); } // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from // v4i32s. This is really a truncate, which we can construct out of (legal) // concats and truncate nodes. if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG)) return M; // Empirical tests suggest this is rarely worth it for vectors of length <= 2. if (NumElts >= 4) { if (SDValue Shuffle = ReconstructShuffle(Op, DAG)) return Shuffle; if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG)) return Shuffle; } if (PreferDUPAndInsert) { // First, build a constant vector with the common element. SmallVector Ops(NumElts, Value); SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG); // Next, insert the elements that do not match the common value. for (unsigned I = 0; I < NumElts; ++I) if (Op.getOperand(I) != Value) NewVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector, Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64)); return NewVector; } // If vector consists of two different values, try to generate two DUPs and // (CONCAT_VECTORS or VECTOR_SHUFFLE). if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) { SmallVector Vals; // Check the consecutive count of the value is the half number of vector // elements. In this case, we can use CONCAT_VECTORS. For example, // // canUseVECTOR_CONCAT = true; // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23, // t24, t24, t24, t24, t24, t24, t24, t24 // // canUseVECTOR_CONCAT = false; // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24, // t24, t24, t24, t24, t24, t24, t24, t24 bool canUseVECTOR_CONCAT = true; for (auto Pair : DifferentValueMap) { // Check different values have same length which is NumElts / 2. if (Pair.second != NumElts / 2) canUseVECTOR_CONCAT = false; Vals.push_back(Pair.first); } // If canUseVECTOR_CONCAT is true, we can generate two DUPs and // CONCAT_VECTORs. 
For example, // // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23, // t24, t24, t24, t24, t24, t24, t24, t24 // ==> // t26: v8i8 = AArch64ISD::DUP t23 // t28: v8i8 = AArch64ISD::DUP t24 // t29: v16i8 = concat_vectors t26, t28 if (canUseVECTOR_CONCAT) { EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); if (isTypeLegal(SubVT) && SubVT.isVector() && SubVT.getVectorNumElements() >= 2) { SmallVector Ops1(NumElts / 2, Vals[0]); SmallVector Ops2(NumElts / 2, Vals[1]); SDValue DUP1 = LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG); SDValue DUP2 = LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG); SDValue CONCAT_VECTORS = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2); return CONCAT_VECTORS; } } // Let's try to generate VECTOR_SHUFFLE. For example, // // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26 // ==> // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28 if (NumElts >= 8) { SmallVector MaskVec; // Build mask for VECTOR_SHUFLLE. SDValue FirstLaneVal = Op.getOperand(0); for (unsigned i = 0; i < NumElts; ++i) { SDValue Val = Op.getOperand(i); if (FirstLaneVal == Val) MaskVec.push_back(i); else MaskVec.push_back(i + NumElts); } SmallVector Ops1(NumElts, Vals[0]); SmallVector Ops2(NumElts, Vals[1]); SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1); SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2); SDValue VECTOR_SHUFFLE = DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec); return VECTOR_SHUFFLE; } } // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we // know the default expansion would otherwise fall back on something even // worse. For a vector with one or two non-undef values, that's // scalar_to_vector for the elements followed by a shuffle (provided the // shuffle is valid for the target) and materialization element by element // on the stack followed by a load for everything else. if (!isConstant && !usesOnlyOneValue) { LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence " "of INSERT_VECTOR_ELT\n"); SDValue Vec = DAG.getUNDEF(VT); SDValue Op0 = Op.getOperand(0); unsigned i = 0; // Use SCALAR_TO_VECTOR for lane zero to // a) Avoid a RMW dependency on the full vector register, and // b) Allow the register coalescer to fold away the copy if the // value is already in an S or D register, and we're forced to emit an // INSERT_SUBREG that we can't fold anywhere. // // We also allow types like i8 and i16 which are illegal scalar but legal // vector element types. After type-legalization the inserted value is // extended (i32) and it is safe to cast them to the vector type by ignoring // the upper bits of the lowest lane (e.g. v8i8, v4i16). 
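A standalone model (plain C++, illustrative names) of the two strategies above for a BUILD_VECTOR that uses exactly two distinct values: CONCAT_VECTORS of two DUPs when each value fills a contiguous half, otherwise a VECTOR_SHUFFLE of two full splats using the mask construction shown in the comment:

#include <cassert>
#include <vector>

static std::vector<int> buildTwoValueVector(const std::vector<int> &Lanes,
                                            int A, int B) {
  const unsigned N = Lanes.size();

  // Case 1: first half all A, second half all B -> concat(dup A, dup B).
  bool HalfAndHalf = true;
  for (unsigned i = 0; i < N; ++i)
    if (Lanes[i] != (i < N / 2 ? A : B))
      HalfAndHalf = false;
  if (HalfAndHalf) {
    std::vector<int> R(N / 2, A); // DUP A
    R.insert(R.end(), N / 2, B);  // CONCAT_VECTORS with DUP B
    return R;
  }

  // Case 2: vector_shuffle of splat(A) and splat(B); lanes equal to the first
  // lane pick from the first source, all others pick from the second.
  std::vector<int> SplatA(N, A), SplatB(N, B), Mask(N), R(N);
  for (unsigned i = 0; i < N; ++i)
    Mask[i] = (Lanes[i] == Lanes[0]) ? i : i + N; // index into SplatA:SplatB
  for (unsigned i = 0; i < N; ++i)
    R[i] = (Mask[i] < (int)N) ? SplatA[Mask[i]] : SplatB[Mask[i] - N];
  return R;
}

int main() {
  std::vector<int> V1 = {7, 7, 7, 7, 9, 9, 9, 9}; // CONCAT_VECTORS case
  std::vector<int> V2 = {7, 7, 9, 9, 9, 7, 9, 9}; // VECTOR_SHUFFLE case
  assert(buildTwoValueVector(V1, 7, 9) == V1);
  assert(buildTwoValueVector(V2, 7, 9) == V2);
  return 0;
}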
if (!Op0.isUndef()) { LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n"); Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0); ++i; } LLVM_DEBUG(if (i < NumElts) dbgs() << "Creating nodes for the other vector elements:\n";); for (; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.isUndef()) continue; SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); } return Vec; } LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find " "better alternative\n"); return SDValue(); } SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { if (useSVEForFixedLengthVectorVT(Op.getValueType(), !Subtarget->isNeonAvailable())) return LowerFixedLengthConcatVectorsToSVE(Op, DAG); assert(Op.getValueType().isScalableVector() && isTypeLegal(Op.getValueType()) && "Expected legal scalable vector type!"); if (isTypeLegal(Op.getOperand(0).getValueType())) { unsigned NumOperands = Op->getNumOperands(); assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); if (NumOperands == 2) return Op; // Concat each pair of subvectors and pack into the lower half of the array. SmallVector ConcatOps(Op->op_begin(), Op->op_end()); while (ConcatOps.size() > 1) { for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) { SDValue V1 = ConcatOps[I]; SDValue V2 = ConcatOps[I + 1]; EVT SubVT = V1.getValueType(); EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext()); ConcatOps[I / 2] = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2); } ConcatOps.resize(ConcatOps.size() / 2); } return ConcatOps[0]; } return SDValue(); } SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); if (useSVEForFixedLengthVectorVT(Op.getValueType(), !Subtarget->isNeonAvailable())) return LowerFixedLengthInsertVectorElt(Op, DAG); EVT VT = Op.getOperand(0).getValueType(); if (VT.getScalarType() == MVT::i1) { EVT VectorVT = getPromotedVTForPredicate(VT); SDLoc DL(Op); SDValue ExtendedVector = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT); SDValue ExtendedValue = DAG.getAnyExtOrTrunc(Op.getOperand(1), DL, VectorVT.getScalarType().getSizeInBits() < 32 ? MVT::i32 : VectorVT.getScalarType()); ExtendedVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector, ExtendedValue, Op.getOperand(2)); return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT); } // Check for non-constant or out of range lane. ConstantSDNode *CI = dyn_cast(Op.getOperand(2)); if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); return Op; } SDValue AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); EVT VT = Op.getOperand(0).getValueType(); if (VT.getScalarType() == MVT::i1) { // We can't directly extract from an SVE predicate; extend it first. // (This isn't the only possible lowering, but it's straightforward.) EVT VectorVT = getPromotedVTForPredicate(VT); SDLoc DL(Op); SDValue Extend = DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0)); MVT ExtractTy = VectorVT == MVT::nxv2i64 ? 
MVT::i64 : MVT::i32; SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy, Extend, Op.getOperand(1)); return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType()); } if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) return LowerFixedLengthExtractVectorElt(Op, DAG); // Check for non-constant or out of range lane. ConstantSDNode *CI = dyn_cast(Op.getOperand(1)); if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f16 || VT == MVT::v8bf16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && VT != MVT::v4bf16) return SDValue(); // For V64 types, we perform extraction by expanding the value // to a V128 type and perform the extraction on that. SDLoc DL(Op); SDValue WideVec = WidenVector(Op.getOperand(0), DAG); EVT WideTy = WideVec.getValueType(); EVT ExtrTy = WideTy.getVectorElementType(); if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) ExtrTy = MVT::i32; // For extractions, we just return the result directly. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, Op.getOperand(1)); } SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType().isFixedLengthVector() && "Only cases that extract a fixed length vector are supported!"); EVT InVT = Op.getOperand(0).getValueType(); unsigned Idx = cast(Op.getOperand(1))->getZExtValue(); unsigned Size = Op.getValueSizeInBits(); // If we don't have legal types yet, do nothing if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) return SDValue(); if (InVT.isScalableVector()) { // This will be matched by custom code during ISelDAGToDAG. if (Idx == 0 && isPackedVectorType(InVT, DAG)) return Op; return SDValue(); } // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. if (Idx == 0 && InVT.getSizeInBits() <= 128) return Op; // If this is extracting the upper 64-bits of a 128-bit vector, we match // that directly. if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 && InVT.getSizeInBits() == 128 && Subtarget->isNeonAvailable()) return Op; if (useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) { SDLoc DL(Op); EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); SDValue NewInVec = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec, NewInVec, DAG.getConstant(Idx, DL, MVT::i64)); return convertFromScalableVector(DAG, Op.getValueType(), Splice); } return SDValue(); } SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType().isScalableVector() && "Only expect to lower inserts into scalable vectors!"); EVT InVT = Op.getOperand(1).getValueType(); unsigned Idx = cast(Op.getOperand(2))->getZExtValue(); SDValue Vec0 = Op.getOperand(0); SDValue Vec1 = Op.getOperand(1); SDLoc DL(Op); EVT VT = Op.getValueType(); if (InVT.isScalableVector()) { if (!isTypeLegal(VT)) return SDValue(); // Break down insert_subvector into simpler parts. 
if (VT.getVectorElementType() == MVT::i1) { unsigned NumElts = VT.getVectorMinNumElements(); EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); SDValue Lo, Hi; Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0, DAG.getVectorIdxConstant(0, DL)); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0, DAG.getVectorIdxConstant(NumElts / 2, DL)); if (Idx < (NumElts / 2)) { SDValue NewLo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1, DAG.getVectorIdxConstant(Idx, DL)); return DAG.getNode(AArch64ISD::UZP1, DL, VT, NewLo, Hi); } else { SDValue NewHi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1, DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL)); return DAG.getNode(AArch64ISD::UZP1, DL, VT, Lo, NewHi); } } // Ensure the subvector is half the size of the main vector. if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2)) return SDValue(); // Here narrow and wide refers to the vector element types. After "casting" // both vectors must have the same bit length and so because the subvector // has fewer elements, those elements need to be bigger. EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount()); EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount()); // NOP cast operands to the largest legal vector of the same element count. if (VT.isFloatingPoint()) { Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG); Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG); } else { // Legal integer vectors are already their largest so Vec0 is fine as is. Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1); } // To replace the top/bottom half of vector V with vector SubV we widen the // preserved half of V, concatenate this to SubV (the order depending on the // half being replaced) and then narrow the result. SDValue Narrow; if (Idx == 0) { SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0); Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0); } else { assert(Idx == InVT.getVectorMinNumElements() && "Invalid subvector index!"); SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0); Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1); } return getSVESafeBitCast(VT, Narrow, DAG); } if (Idx == 0 && isPackedVectorType(VT, DAG)) { // This will be matched by custom code during ISelDAGToDAG. 
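The integer Idx == 0 path above widens the subvector, unpacks the preserved half of the original vector, and narrows the pair back with UZP1. A fixed-width stand-in (8 x i32 "narrow", 4 x i64 "wide"; real SVE vectors are scalable, so the lane counts here are illustrative) showing why the result is the original vector with its bottom half replaced:

#include <array>
#include <cassert>
#include <cstdint>

using Narrow = std::array<uint32_t, 8>; // nxv4i32 stand-in
using Half   = std::array<uint32_t, 4>;
using Wide   = std::array<uint64_t, 4>; // nxv2i64 stand-in

// UUNPKHI: zero-extend the high half of a narrow vector into wide lanes.
static Wide uunpkhi(const Narrow &V) {
  Wide R{};
  for (unsigned i = 0; i < 4; ++i)
    R[i] = V[4 + i];
  return R;
}

// ANY_EXTEND of the subvector into wide lanes.
static Wide extend(const Half &V) {
  Wide R{};
  for (unsigned i = 0; i < 4; ++i)
    R[i] = V[i];
  return R;
}

// UZP1 at the narrow element size: the even narrow lanes of A:B, i.e. the low
// 32 bits of every wide lane of A followed by those of B.
static Narrow uzp1(const Wide &A, const Wide &B) {
  Narrow R{};
  for (unsigned i = 0; i < 4; ++i) {
    R[i] = static_cast<uint32_t>(A[i]);
    R[4 + i] = static_cast<uint32_t>(B[i]);
  }
  return R;
}

int main() {
  Narrow Vec0 = {0, 1, 2, 3, 4, 5, 6, 7};
  Half Vec1 = {100, 101, 102, 103};

  // Insert Vec1 at index 0: only the high half of Vec0 survives.
  Narrow Got = uzp1(extend(Vec1), uunpkhi(Vec0));
  Narrow Want = {100, 101, 102, 103, 4, 5, 6, 7};
  assert(Got == Want);
  return 0;
}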
if (Vec0.isUndef()) return Op; std::optional PredPattern = getSVEPredPatternFromNumElements(InVT.getVectorNumElements()); auto PredTy = VT.changeVectorElementType(MVT::i1); SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern); SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1); return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0); } return SDValue(); } static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) { if (Op.getOpcode() != AArch64ISD::DUP && Op.getOpcode() != ISD::SPLAT_VECTOR && Op.getOpcode() != ISD::BUILD_VECTOR) return false; if (Op.getOpcode() == ISD::BUILD_VECTOR && !isAllConstantBuildVector(Op, SplatVal)) return false; if (Op.getOpcode() != ISD::BUILD_VECTOR && !isa(Op->getOperand(0))) return false; SplatVal = Op->getConstantOperandVal(0); if (Op.getValueType().getVectorElementType() != MVT::i64) SplatVal = (int32_t)SplatVal; Negated = false; if (isPowerOf2_64(SplatVal)) return true; Negated = true; if (isPowerOf2_64(-SplatVal)) { SplatVal = -SplatVal; return true; } return false; } SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) return LowerFixedLengthVectorIntDivideToSVE(Op, DAG); assert(VT.isScalableVector() && "Expected a scalable vector."); bool Signed = Op.getOpcode() == ISD::SDIV; unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; bool Negated; uint64_t SplatVal; if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) { SDValue Pg = getPredicateForScalableVector(DAG, dl, VT); SDValue Res = DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0), DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32)); if (Negated) Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res); return Res; } if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64) return LowerToPredicatedOp(Op, DAG, PredOpcode); // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit // operations, and truncate the result. EVT WidenedVT; if (VT == MVT::nxv16i8) WidenedVT = MVT::nxv8i16; else if (VT == MVT::nxv8i16) WidenedVT = MVT::nxv4i32; else llvm_unreachable("Unexpected Custom DIV operation"); unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI; SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0)); SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1)); SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0)); SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1)); SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo); SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi); return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi); } bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { // Currently no fixed length shuffles that require SVE are legal. 
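A scalar model of the isPow2Splat/SRAD path in LowerDIV above: SVE's ASRD rounds the shifted quotient towards zero, and a negative power-of-two divisor only needs an extra negation of the result. asrd and sdivPow2 are illustrative helpers, checked against C++'s truncating division.

#include <cassert>
#include <cstdint>

// Arithmetic shift right by K that rounds towards zero: negative inputs get a
// (2^K - 1) bias before the shift.
static int64_t asrd(int64_t X, unsigned K) {
  int64_t Bias = (X < 0) ? ((int64_t(1) << K) - 1) : 0;
  return (X + Bias) >> K;
}

// Signed divide by a splatted power of two: divide by |D| with ASRD, then
// negate the result if D was negative.
static int64_t sdivPow2(int64_t X, int64_t D) {
  bool Negated = D < 0;
  uint64_t Abs = Negated ? uint64_t(-D) : uint64_t(D);
  unsigned K = 0;
  while ((uint64_t(1) << K) != Abs)
    ++K; // Log2_64(|D|)
  int64_t Res = asrd(X, K);
  return Negated ? -Res : Res;
}

int main() {
  for (int64_t X : {-17, -8, -1, 0, 1, 5, 8, 100})
    for (int64_t D : {int64_t(1), int64_t(4), int64_t(-4), int64_t(32)})
      assert(sdivPow2(X, D) == X / D); // '/' also truncates towards zero
  return 0;
}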
if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) return false; if (VT.getVectorNumElements() == 4 && (VT.is128BitVector() || VT.is64BitVector())) { unsigned Cost = getPerfectShuffleCost(M); if (Cost <= 1) return true; } bool DummyBool; int DummyInt; unsigned DummyUnsigned; return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || isEXTMask(M, VT, DummyBool, DummyUnsigned) || // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || isZIPMask(M, VT, DummyUnsigned) || isTRN_v_undef_Mask(M, VT, DummyUnsigned) || isUZP_v_undef_Mask(M, VT, DummyUnsigned) || isZIP_v_undef_Mask(M, VT, DummyUnsigned) || isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || isConcatMask(M, VT, VT.getSizeInBits() == 128)); } bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef M, EVT VT) const { // Just delegate to the generic legality, clear masks aren't special. return isShuffleMaskLegal(M, VT); } /// getVShiftImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift operation, where all the elements of the /// build_vector must have the same constant integer value. static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { // Ignore bit_converts. while (Op.getOpcode() == ISD::BITCAST) Op = Op.getOperand(0); BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, ElementBits) || SplatBitSize > ElementBits) return false; Cnt = SplatBits.getSExtValue(); return true; } /// isVShiftLImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift left operation. That value must be in the range: /// 0 <= Value < ElementBits for a left shift; or /// 0 <= Value <= ElementBits for a long left shift. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); int64_t ElementBits = VT.getScalarSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); } /// isVShiftRImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift right operation. The value must be in the range: /// 1 <= Value <= ElementBits for a right shift; or static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); int64_t ElementBits = VT.getScalarSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); } SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.getScalarType() == MVT::i1) { // Lower i1 truncate to `(x & 1) != 0`. 
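The legal immediate ranges encoded by isVShiftLImm/isVShiftRImm above, restated as a standalone checker (illustrative names) with a few 8-bit element examples:

#include <cassert>
#include <cstdint>

// Left shift: 0 <= Cnt < ElementBits, or 0 <= Cnt <= ElementBits for the long
// (widening) form.
static bool isLegalVShiftLImm(int64_t Cnt, unsigned ElementBits, bool IsLong) {
  return Cnt >= 0 && (IsLong ? Cnt - 1 : Cnt) < (int64_t)ElementBits;
}

// Right shift: 1 <= Cnt <= ElementBits, or 1 <= Cnt <= ElementBits / 2 for the
// narrowing form.
static bool isLegalVShiftRImm(int64_t Cnt, unsigned ElementBits, bool IsNarrow) {
  return Cnt >= 1 && Cnt <= (int64_t)(IsNarrow ? ElementBits / 2 : ElementBits);
}

int main() {
  assert(isLegalVShiftLImm(0, 8, /*IsLong=*/false));
  assert(!isLegalVShiftLImm(8, 8, /*IsLong=*/false)); // SHL #8 on i8: invalid
  assert(isLegalVShiftLImm(8, 8, /*IsLong=*/true));   // widening SHLL #8: valid
  assert(isLegalVShiftRImm(8, 8, /*IsNarrow=*/false)); // SSHR #8 on i8: valid
  assert(!isLegalVShiftRImm(0, 8, /*IsNarrow=*/false));
  assert(!isLegalVShiftRImm(8, 8, /*IsNarrow=*/true)); // narrowing limited to 4
  return 0;
}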
SDLoc dl(Op); EVT OpVT = Op.getOperand(0).getValueType(); SDValue Zero = DAG.getConstant(0, dl, OpVT); SDValue One = DAG.getConstant(1, dl, OpVT); SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One); return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE); } if (!VT.isVector() || VT.isScalableVector()) return SDValue(); if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(), !Subtarget->isNeonAvailable())) return LowerFixedLengthVectorTruncateToSVE(Op, DAG); return SDValue(); } SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc DL(Op); int64_t Cnt; if (!Op.getOperand(1).getValueType().isVector()) return Op; unsigned EltSize = VT.getScalarSizeInBits(); switch (Op.getOpcode()) { case ISD::SHL: if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED); if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), DAG.getConstant(Cnt, DL, MVT::i32)); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32), Op.getOperand(0), Op.getOperand(1)); case ISD::SRA: case ISD::SRL: if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) { unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED : AArch64ISD::SRL_PRED; return LowerToPredicatedOp(Op, DAG, Opc); } // Right shift immediate if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { unsigned Opc = (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; return DAG.getNode(Opc, DL, VT, Op.getOperand(0), DAG.getConstant(Cnt, DL, MVT::i32)); } // Right shift register. Note, there is not a shift right register // instruction, but the shift left register instruction takes a signed // value, where negative numbers specify a right shift. unsigned Opc = (Op.getOpcode() == ISD::SRA) ? 
Intrinsic::aarch64_neon_sshl : Intrinsic::aarch64_neon_ushl; // negate the shift amount SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op.getOperand(1)); SDValue NegShiftLeft = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0), NegShift); return NegShiftLeft; } llvm_unreachable("unexpected shift opcode"); } static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &dl, SelectionDAG &DAG) { EVT SrcVT = LHS.getValueType(); assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && "function only supposed to emit natural comparisons"); BuildVectorSDNode *BVN = dyn_cast(RHS.getNode()); APInt CnstBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits); bool IsZero = IsCnst && (CnstBits == 0); if (SrcVT.getVectorElementType().isFloatingPoint()) { switch (CC) { default: return SDValue(); case AArch64CC::NE: { SDValue Fcmeq; if (IsZero) Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); else Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); return DAG.getNOT(dl, Fcmeq, VT); } case AArch64CC::EQ: if (IsZero) return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); case AArch64CC::GE: if (IsZero) return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS); case AArch64CC::GT: if (IsZero) return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); case AArch64CC::LE: if (!NoNans) return SDValue(); // If we ignore NaNs then we can use to the LS implementation. [[fallthrough]]; case AArch64CC::LS: if (IsZero) return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS); case AArch64CC::LT: if (!NoNans) return SDValue(); // If we ignore NaNs then we can use to the MI implementation. 
[[fallthrough]]; case AArch64CC::MI: if (IsZero) return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); } } switch (CC) { default: return SDValue(); case AArch64CC::NE: { SDValue Cmeq; if (IsZero) Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); else Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); return DAG.getNOT(dl, Cmeq, VT); } case AArch64CC::EQ: if (IsZero) return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); case AArch64CC::GE: if (IsZero) return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); case AArch64CC::GT: if (IsZero) return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); case AArch64CC::LE: if (IsZero) return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); case AArch64CC::LS: return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); case AArch64CC::LO: return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); case AArch64CC::LT: if (IsZero) return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); case AArch64CC::HI: return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); case AArch64CC::HS: return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); } } SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isScalableVector()) return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO); if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(), !Subtarget->isNeonAvailable())) return LowerFixedLengthVectorSetccToSVE(Op, DAG); ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); SDLoc dl(Op); if (LHS.getValueType().getVectorElementType().isInteger()) { assert(LHS.getValueType() == RHS.getValueType()); AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); SDValue Cmp = EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG); return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); } const bool FullFP16 = DAG.getSubtarget().hasFullFP16(); // Make v4f16 (only) fcmp operations utilise vector instructions // v8f16 support will be a litle more complicated if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) { if (LHS.getValueType().getVectorNumElements() == 4) { LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS); SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC); DAG.ReplaceAllUsesWith(Op, NewSetcc); CmpVT = MVT::v4i32; } else return SDValue(); } assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) || LHS.getValueType().getVectorElementType() != MVT::f128); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two branches to implement. 
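Condition codes with no single AArch64 vector compare are split into two compares whose lane masks are ORed together (the CC1/CC2 handling that follows). A scalar sketch for one such case, ordered-not-equal, assuming the usual decomposition into greater-than OR less-than; the actual CC pair is chosen by changeVectorFPCCToAArch64CC.

#include <cassert>
#include <cmath>

// Reference predicate: ordered (neither operand NaN) and not equal.
static bool setONE(float A, float B) {
  return !std::isnan(A) && !std::isnan(B) && A != B;
}

// What the two-compare lowering computes lane by lane.
static bool twoCompareLowering(float A, float B) {
  bool Gt = A > B; // first condition (CC1)
  bool Lt = A < B; // second condition (CC2), merged with an OR
  return Gt || Lt;
}

int main() {
  const float NaN = std::nanf("");
  const float Vals[] = {-1.0f, 0.0f, 2.5f, NaN};
  for (float A : Vals)
    for (float B : Vals)
      assert(twoCompareLowering(A, B) == setONE(A, B));
  return 0;
}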
AArch64CC::CondCode CC1, CC2; bool ShouldInvert; changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs(); SDValue Cmp = EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); if (!Cmp.getNode()) return SDValue(); if (CC2 != AArch64CC::AL) { SDValue Cmp2 = EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); if (!Cmp2.getNode()) return SDValue(); Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2); } Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); if (ShouldInvert) Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); return Cmp; } static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG) { SDValue VecOp = ScalarOp.getOperand(0); auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx, DAG.getConstant(0, DL, MVT::i64)); } static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG) { unsigned ScalarOpcode; switch (Opcode) { case ISD::VECREDUCE_AND: ScalarOpcode = ISD::AND; break; case ISD::VECREDUCE_OR: ScalarOpcode = ISD::OR; break; case ISD::VECREDUCE_XOR: ScalarOpcode = ISD::XOR; break; default: llvm_unreachable("Expected bitwise vector reduction"); return SDValue(); } EVT VecVT = Vec.getValueType(); assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() && "Expected power-of-2 length vector"); EVT ElemVT = VecVT.getVectorElementType(); SDValue Result; unsigned NumElems = VecVT.getVectorNumElements(); // Special case for boolean reductions if (ElemVT == MVT::i1) { // Split large vectors into smaller ones if (NumElems > 16) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL); EVT HalfVT = Lo.getValueType(); SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi); return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG); } // Vectors that are less than 64 bits get widened to neatly fit a 64 bit // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to // this element size leads to the best codegen, since e.g. setcc results // might need to be truncated otherwise. EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u)); // any_ext doesn't work with umin/umax, so only use it for uadd. unsigned ExtendOp = ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND; SDValue Extended = DAG.getNode( ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec); switch (ScalarOpcode) { case ISD::AND: Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended); break; case ISD::OR: Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended); break; case ISD::XOR: Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended); break; default: llvm_unreachable("Unexpected Opcode"); } Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1); } else { // Iteratively split the vector in half and combine using the bitwise // operation until it fits in a 64 bit register. 
while (VecVT.getSizeInBits() > 64) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL); VecVT = Lo.getValueType(); NumElems = VecVT.getVectorNumElements(); Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi); } EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits()); // Do the remaining work on a scalar since it allows the code generator to // combine the shift and bitwise operation into one instruction and since // integer instructions can have higher throughput than vector instructions. SDValue Scalar = DAG.getBitcast(ScalarVT, Vec); // Iteratively combine the lower and upper halves of the scalar using the // bitwise operation, halving the relevant region of the scalar in each // iteration, until the relevant region is just one element of the original // vector. for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) { SDValue ShiftAmount = DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64); SDValue Shifted = DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount); Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted); } Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT); } return DAG.getAnyExtOrTrunc(Result, DL, VT); } SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); // Try to lower fixed length reductions to SVE. EVT SrcVT = Src.getValueType(); bool OverrideNEON = !Subtarget->isNeonAvailable() || Op.getOpcode() == ISD::VECREDUCE_AND || Op.getOpcode() == ISD::VECREDUCE_OR || Op.getOpcode() == ISD::VECREDUCE_XOR || Op.getOpcode() == ISD::VECREDUCE_FADD || (Op.getOpcode() != ISD::VECREDUCE_ADD && SrcVT.getVectorElementType() == MVT::i64); if (SrcVT.isScalableVector() || useSVEForFixedLengthVectorVT( SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) { if (SrcVT.getVectorElementType() == MVT::i1) return LowerPredReductionToSVE(Op, DAG); switch (Op.getOpcode()) { case ISD::VECREDUCE_ADD: return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG); case ISD::VECREDUCE_AND: return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG); case ISD::VECREDUCE_OR: return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG); case ISD::VECREDUCE_SMAX: return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG); case ISD::VECREDUCE_SMIN: return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG); case ISD::VECREDUCE_UMAX: return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG); case ISD::VECREDUCE_UMIN: return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG); case ISD::VECREDUCE_XOR: return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG); case ISD::VECREDUCE_FADD: return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG); case ISD::VECREDUCE_FMAX: return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG); case ISD::VECREDUCE_FMIN: return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG); case ISD::VECREDUCE_FMAXIMUM: return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG); case ISD::VECREDUCE_FMINIMUM: return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG); default: llvm_unreachable("Unhandled fixed length reduction"); } } // Lower NEON reductions. 
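Once the value fits in 64 bits, the non-i1 path above bitcasts it to a scalar and folds halves together with shifts (the i1 path instead maps AND/OR/XOR onto UMIN/UMAX/ADD of extended lanes first). A standalone model for eight i8 lanes reduced with XOR (AND and OR fold the same way); it assumes little-endian lane order, as on AArch64.

#include <cassert>
#include <cstdint>
#include <cstring>

static uint8_t reduceXor8x8(const uint8_t Lanes[8]) {
  uint64_t Scalar;
  std::memcpy(&Scalar, Lanes, 8); // the "bitcast v8i8 -> i64"

  // Shift starts at NumElems / 2 and halves each step: fold the upper 32, 16
  // and 8 bits onto the lower part.
  for (unsigned Shift = 4; Shift > 0; Shift /= 2)
    Scalar ^= Scalar >> (Shift * 8);

  return static_cast<uint8_t>(Scalar); // the low lane holds the reduction
}

int main() {
  const uint8_t Lanes[8] = {0x01, 0x22, 0x10, 0x7f, 0x80, 0x05, 0xc3, 0x3c};
  uint8_t Want = 0;
  for (uint8_t L : Lanes)
    Want ^= L; // lane-by-lane reference
  assert(reduceXor8x8(Lanes) == Want);
  return 0;
}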
SDLoc dl(Op); switch (Op.getOpcode()) { case ISD::VECREDUCE_AND: case ISD::VECREDUCE_OR: case ISD::VECREDUCE_XOR: return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0), Op.getValueType(), dl, DAG); case ISD::VECREDUCE_ADD: return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG); case ISD::VECREDUCE_SMAX: return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG); case ISD::VECREDUCE_SMIN: return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG); case ISD::VECREDUCE_UMAX: return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG); case ISD::VECREDUCE_UMIN: return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG); default: llvm_unreachable("Unhandled reduction"); } } SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const { auto &Subtarget = DAG.getSubtarget(); if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) return SDValue(); // LSE has an atomic load-add instruction, but not a load-sub. SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); SDValue RHS = Op.getOperand(2); AtomicSDNode *AN = cast(Op.getNode()); RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS); return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(), Op.getOperand(0), Op.getOperand(1), RHS, AN->getMemOperand()); } SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const { auto &Subtarget = DAG.getSubtarget(); // No point replacing if we don't have the relevant instruction/libcall anyway if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) return SDValue(); // LSE has an atomic load-clear instruction, but not a load-and. SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); assert(VT != MVT::i128 && "Handled elsewhere, code replicated."); SDValue RHS = Op.getOperand(2); AtomicSDNode *AN = cast(Op.getNode()); RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS); return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(), Op.getOperand(0), Op.getOperand(1), RHS, AN->getMemOperand()); } SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const { SDLoc dl(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(), PtrVT, 0); const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask(); if (Subtarget->hasCustomCallingConv()) TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size, DAG.getConstant(4, dl, MVT::i64)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue()); Chain = DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue), Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64), DAG.getRegisterMask(Mask), Chain.getValue(1)); // To match the actual intent better, we should read the output from X15 here // again (instead of potentially spilling it to the stack), but rereading Size // from X15 here doesn't work at -O0, since it thinks that X15 is undefined // here. 
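The LowerATOMIC_LOAD_SUB / LowerATOMIC_LOAD_AND lowerings above rely on LSE having LDADD and LDCLR (and-with-complement) but no load-sub or load-and, so they rewrite the operand instead. A small sketch of the two identities; std::atomic is used only to make the fetch_sub rewrite concrete.

#include <atomic>
#include <cassert>
#include <cstdint>

//   atomicrmw sub p, x  ->  LDADD p, (0 - x)
//   atomicrmw and p, m  ->  LDCLR p, ~m      since old & ~(~m) == old & m
int main() {
  std::atomic<uint32_t> P{100};
  uint32_t X = 7;
  uint32_t Old = P.fetch_add(0u - X); // add the negation instead of subtracting
  assert(Old == 100 && P.load() == 93);

  uint32_t Val = 0xf0f0f0f0, M = 0x0ff00ff0;
  uint32_t ClearMask = ~M;                 // operand handed to LDCLR
  assert((Val & ~ClearMask) == (Val & M)); // LDCLR semantics match 'and m'
  return 0;
}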
Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size, DAG.getConstant(4, dl, MVT::i64)); return Chain; } // When x and y are extended, lower: // avgfloor(x, y) -> (x + y) >> 1 // avgceil(x, y) -> (x + y + 1) >> 1 // Otherwise, lower to: // avgfloor(x, y) -> (x >> 1) + (y >> 1) + (x & y & 1) // avgceil(x, y) -> (x >> 1) + (y >> 1) + ((x || y) & 1) SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const { if (Subtarget->hasSVE2()) return LowerToPredicatedOp(Op, DAG, NewOp); SDLoc dl(Op); SDValue OpA = Op->getOperand(0); SDValue OpB = Op->getOperand(1); EVT VT = Op.getValueType(); bool IsCeil = (Op->getOpcode() == ISD::AVGCEILS || Op->getOpcode() == ISD::AVGCEILU); bool IsSigned = (Op->getOpcode() == ISD::AVGFLOORS || Op->getOpcode() == ISD::AVGCEILS); unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL; assert(VT.isScalableVector() && "Only expect to lower scalable vector op!"); auto IsZeroExtended = [&DAG](SDValue &Node) { KnownBits Known = DAG.computeKnownBits(Node, 0); return Known.Zero.isSignBitSet(); }; auto IsSignExtended = [&DAG](SDValue &Node) { return (DAG.ComputeNumSignBits(Node, 0) > 1); }; SDValue ConstantOne = DAG.getConstant(1, dl, VT); if ((!IsSigned && IsZeroExtended(OpA) && IsZeroExtended(OpB)) || (IsSigned && IsSignExtended(OpA) && IsSignExtended(OpB))) { SDValue Add = DAG.getNode(ISD::ADD, dl, VT, OpA, OpB); if (IsCeil) Add = DAG.getNode(ISD::ADD, dl, VT, Add, ConstantOne); return DAG.getNode(ShiftOpc, dl, VT, Add, ConstantOne); } SDValue ShiftOpA = DAG.getNode(ShiftOpc, dl, VT, OpA, ConstantOne); SDValue ShiftOpB = DAG.getNode(ShiftOpc, dl, VT, OpB, ConstantOne); SDValue tmp = DAG.getNode(IsCeil ? ISD::OR : ISD::AND, dl, VT, OpA, OpB); tmp = DAG.getNode(ISD::AND, dl, VT, tmp, ConstantOne); SDValue Add = DAG.getNode(ISD::ADD, dl, VT, ShiftOpA, ShiftOpB); return DAG.getNode(ISD::ADD, dl, VT, Add, tmp); } SDValue AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "Only Windows alloca probing supported"); SDLoc dl(Op); // Get the inputs. 
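The expansions used by LowerAVG above when the operands are not known to be extended avoid the intermediate overflow of x + y. A standalone check of both identities for unsigned 8-bit lanes against a wider-precision reference:

#include <cassert>
#include <cstdint>

//   avgfloor(x, y) = (x >> 1) + (y >> 1) + (x & y & 1)
//   avgceil(x, y)  = (x >> 1) + (y >> 1) + ((x | y) & 1)
static uint8_t avgFloorU8(uint8_t X, uint8_t Y) {
  return (X >> 1) + (Y >> 1) + (X & Y & 1);
}
static uint8_t avgCeilU8(uint8_t X, uint8_t Y) {
  return (X >> 1) + (Y >> 1) + ((X | Y) & 1);
}

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned Y = 0; Y < 256; ++Y) {
      unsigned Sum = X + Y;                     // computed without i8 overflow
      assert(avgFloorU8(X, Y) == Sum / 2);      // floor((x + y) / 2)
      assert(avgCeilU8(X, Y) == (Sum + 1) / 2); // ceil((x + y) / 2)
    }
  return 0;
}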
SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); MaybeAlign Align = cast(Op.getOperand(2))->getMaybeAlignValue(); EVT VT = Node->getValueType(0); if (DAG.getMachineFunction().getFunction().hasFnAttribute( "no-stack-arg-probe")) { SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); SDValue Ops[2] = {SP, Chain}; return DAG.getMergeValues(Ops, dl); } Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG); SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); SDValue Ops[2] = {SP, Chain}; return DAG.getMergeValues(Ops, dl); } SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); assert(VT != MVT::i64 && "Expected illegal VSCALE node"); SDLoc DL(Op); APInt MulImm = cast(Op.getOperand(0))->getAPIntValue(); return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL, VT); } /// Set the IntrinsicInfo for the `aarch64_sve_st` intrinsics. template static bool setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) { Info.opc = ISD::INTRINSIC_VOID; // Retrieve EC from first vector argument. const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType()); ElementCount EC = VT.getVectorElementCount(); #ifndef NDEBUG // Check the assumption that all input vectors are the same type. for (unsigned I = 0; I < NumVecs; ++I) assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) && "Invalid type."); #endif // memVT is `NumVecs * VT`. Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(), EC * NumVecs); Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1); Info.offset = 0; Info.align.reset(); Info.flags = MachineMemOperand::MOStore; return true; } /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. 
bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const { auto &DL = I.getModule()->getDataLayout(); switch (Intrinsic) { case Intrinsic::aarch64_sve_st2: return setInfoSVEStN<2>(*this, DL, Info, I); case Intrinsic::aarch64_sve_st3: return setInfoSVEStN<3>(*this, DL, Info, I); case Intrinsic::aarch64_sve_st4: return setInfoSVEStN<4>(*this, DL, Info, I); case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: case Intrinsic::aarch64_neon_ld1x2: case Intrinsic::aarch64_neon_ld1x3: - case Intrinsic::aarch64_neon_ld1x4: + case Intrinsic::aarch64_neon_ld1x4: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(I.arg_size() - 1); + Info.offset = 0; + Info.align.reset(); + // volatile loads with NEON intrinsics not supported + Info.flags = MachineMemOperand::MOLoad; + return true; + } case Intrinsic::aarch64_neon_ld2lane: case Intrinsic::aarch64_neon_ld3lane: case Intrinsic::aarch64_neon_ld4lane: case Intrinsic::aarch64_neon_ld2r: case Intrinsic::aarch64_neon_ld3r: case Intrinsic::aarch64_neon_ld4r: { Info.opc = ISD::INTRINSIC_W_CHAIN; - // Conservatively set memVT to the entire set of vectors loaded. - uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; - Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + // The ldN lane/dup intrinsics return a struct of vectors that all share the + // same type, so take the element type from the first member. + Type *RetTy = I.getType(); + auto *StructTy = cast(RetTy); + unsigned NumElts = StructTy->getNumElements(); + Type *VecTy = StructTy->getElementType(0); + MVT EleVT = MVT::getVT(VecTy).getVectorElementType(); + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts); Info.ptrVal = I.getArgOperand(I.arg_size() - 1); Info.offset = 0; Info.align.reset(); // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; } case Intrinsic::aarch64_neon_st2: case Intrinsic::aarch64_neon_st3: case Intrinsic::aarch64_neon_st4: case Intrinsic::aarch64_neon_st1x2: case Intrinsic::aarch64_neon_st1x3: - case Intrinsic::aarch64_neon_st1x4: + case Intrinsic::aarch64_neon_st1x4: { + Info.opc = ISD::INTRINSIC_VOID; + unsigned NumElts = 0; + for (const Value *Arg : I.args()) { + Type *ArgTy = Arg->getType(); + if (!ArgTy->isVectorTy()) + break; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; + } + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(I.arg_size() - 1); + Info.offset = 0; + Info.align.reset(); + // volatile stores with NEON intrinsics not supported + Info.flags = MachineMemOperand::MOStore; + return true; + } case Intrinsic::aarch64_neon_st2lane: case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: { Info.opc = ISD::INTRINSIC_VOID; - // Conservatively set memVT to the entire set of vectors stored.
unsigned NumElts = 0; + // All of the vector arguments have the same type, so take the element type + // from the first one. + Type *VecTy = I.getArgOperand(0)->getType(); + MVT EleVT = MVT::getVT(VecTy).getVectorElementType(); + for (const Value *Arg : I.args()) { Type *ArgTy = Arg->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += DL.getTypeSizeInBits(ArgTy) / 64; + NumElts += 1; } - Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts); Info.ptrVal = I.getArgOperand(I.arg_size() - 1); Info.offset = 0; Info.align.reset(); // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; } case Intrinsic::aarch64_ldaxr: case Intrinsic::aarch64_ldxr: { Type *ValTy = I.getParamElementType(0); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(ValTy); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = DL.getABITypeAlign(ValTy); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } case Intrinsic::aarch64_stlxr: case Intrinsic::aarch64_stxr: { Type *ValTy = I.getParamElementType(1); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(ValTy); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; Info.align = DL.getABITypeAlign(ValTy); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } case Intrinsic::aarch64_ldaxp: case Intrinsic::aarch64_ldxp: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = Align(16); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; case Intrinsic::aarch64_stlxp: case Intrinsic::aarch64_stxp: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(2); Info.offset = 0; Info.align = Align(16); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; case Intrinsic::aarch64_sve_ldnt1: { Type *ElTy = cast(I.getType())->getElementType(); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(I.getType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; Info.align = DL.getABITypeAlign(ElTy); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal; return true; } case Intrinsic::aarch64_sve_stnt1: { Type *ElTy = cast(I.getArgOperand(0)->getType())->getElementType(); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(I.getOperand(0)->getType()); Info.ptrVal = I.getArgOperand(2); Info.offset = 0; Info.align = DL.getABITypeAlign(ElTy); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal; return true; } case Intrinsic::aarch64_mops_memset_tag: { Value *Dst = I.getArgOperand(0); Value *Val = I.getArgOperand(1); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(Val->getType()); Info.ptrVal = Dst; Info.offset = 0; Info.align = I.getParamAlign(0).valueOrOne(); Info.flags = MachineMemOperand::MOStore; // The size of the memory being operated on is unknown at this point Info.size = MemoryLocation::UnknownSize; return true; } default: break; } return false; } bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { // TODO: This may be worth removing. Check regression tests for diffs. if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT)) return false; // If we're reducing the load width in order to avoid having to use an extra // instruction to do extension then it's probably a good idea.
if (ExtTy != ISD::NON_EXTLOAD) return true; // Don't reduce load width if it would prevent us from combining a shift into // the offset. MemSDNode *Mem = dyn_cast(Load); assert(Mem); const SDValue &Base = Mem->getBasePtr(); if (Base.getOpcode() == ISD::ADD && Base.getOperand(1).getOpcode() == ISD::SHL && Base.getOperand(1).hasOneUse() && Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) { // It's unknown whether a scalable vector has a power-of-2 bitwidth. if (Mem->getMemoryVT().isScalableVector()) return false; // The shift can be combined if it matches the size of the value being // loaded (and so reducing the width would make it not match). uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1); uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8; if (ShiftAmount == Log2_32(LoadBytes)) return false; } // We have no reason to disallow reducing the load width, so allow it. return true; } // Treat a sext_inreg(extract(..)) as free if it has multiple uses. bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const { EVT VT = Extend.getValueType(); if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) { SDValue Extract = Extend.getOperand(0); if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse()) Extract = Extract.getOperand(0); if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) { EVT VecVT = Extract.getOperand(0).getValueType(); if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16) return false; } } return true; } // Truncations from 64-bit GPR to 32-bit GPR is free. bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue(); uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue(); return NumBits1 > NumBits2; } bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) return false; uint64_t NumBits1 = VT1.getFixedSizeInBits(); uint64_t NumBits2 = VT2.getFixedSizeInBits(); return NumBits1 > NumBits2; } /// Check if it is profitable to hoist instruction in then/else to if. /// Not profitable if I and it's user can form a FMA instruction /// because we prefer FMSUB/FMADD. bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { if (I->getOpcode() != Instruction::FMul) return true; if (!I->hasOneUse()) return true; Instruction *User = I->user_back(); if (!(User->getOpcode() == Instruction::FSub || User->getOpcode() == Instruction::FAdd)) return true; const TargetOptions &Options = getTargetMachine().Options; const Function *F = I->getFunction(); const DataLayout &DL = F->getParent()->getDataLayout(); Type *Ty = User->getOperand(0)->getType(); return !(isFMAFasterThanFMulAndFAdd(*F, Ty) && isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)); } // All 32-bit GPR operations implicitly zero the high-half of the corresponding // 64-bit GPR. 
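// e.g. a 32-bit 'add w0, w1, w2' also clears bits [63:32] of x0, so a later
// zext of that result from i32 to i64 costs no extra instruction.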
bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 == 32 && NumBits2 == 64; } bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 == 32 && NumBits2 == 64; } bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { EVT VT1 = Val.getValueType(); if (isZExtFree(VT1, VT2)) { return true; } if (Val.getOpcode() != ISD::LOAD) return false; // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() && VT2.isSimple() && !VT2.isVector() && VT2.isInteger() && VT1.getSizeInBits() <= 32); } bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { if (isa(Ext)) return false; // Vector types are not free. if (Ext->getType()->isVectorTy()) return false; for (const Use &U : Ext->uses()) { // The extension is free if we can fold it with a left shift in an // addressing mode or an arithmetic operation: add, sub, and cmp. // Is there a shift? const Instruction *Instr = cast(U.getUser()); // Is this a constant shift? switch (Instr->getOpcode()) { case Instruction::Shl: if (!isa(Instr->getOperand(1))) return false; break; case Instruction::GetElementPtr: { gep_type_iterator GTI = gep_type_begin(Instr); auto &DL = Ext->getModule()->getDataLayout(); std::advance(GTI, U.getOperandNo()-1); Type *IdxTy = GTI.getIndexedType(); // This extension will end up with a shift because of the scaling factor. // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0. // Get the shift amount based on the scaling factor: // log2(sizeof(IdxTy)) - log2(8). if (IdxTy->isScalableTy()) return false; uint64_t ShiftAmt = llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) - 3; // Is the constant foldable in the shift of the addressing mode? // I.e., shift amount is between 1 and 4 inclusive. if (ShiftAmt == 0 || ShiftAmt > 4) return false; break; } case Instruction::Trunc: // Check if this is a noop. // trunc(sext ty1 to ty2) to ty1. if (Instr->getType() == Ext->getOperand(0)->getType()) continue; [[fallthrough]]; default: return false; } // At this point we can use the bfm family, so this extension is free // for that use. } return true; } static bool isSplatShuffle(Value *V) { if (auto *Shuf = dyn_cast(V)) return all_equal(Shuf->getShuffleMask()); return false; } /// Check if both Op1 and Op2 are shufflevector extracts of either the lower /// or upper half of the vector elements. 
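/// e.g. (illustrative IR) extracting the low and high halves of an <8 x i16>:
///   %lo = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
///   %hi = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>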
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat = false) { auto areTypesHalfed = [](Value *FullV, Value *HalfV) { auto *FullTy = FullV->getType(); auto *HalfTy = HalfV->getType(); return FullTy->getPrimitiveSizeInBits().getFixedValue() == 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue(); }; auto extractHalf = [](Value *FullV, Value *HalfV) { auto *FullVT = cast(FullV->getType()); auto *HalfVT = cast(HalfV->getType()); return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); }; ArrayRef M1, M2; Value *S1Op1 = nullptr, *S2Op1 = nullptr; if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) || !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2)))) return false; // If we allow splats, set S1Op1/S2Op1 to nullptr for the relavant arg so that // it is not checked as an extract below. if (AllowSplat && isSplatShuffle(Op1)) S1Op1 = nullptr; if (AllowSplat && isSplatShuffle(Op2)) S2Op1 = nullptr; // Check that the operands are half as wide as the result and we extract // half of the elements of the input vectors. if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) || (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2)))) return false; // Check the mask extracts either the lower or upper half of vector // elements. int M1Start = 0; int M2Start = 0; int NumElements = cast(Op1->getType())->getNumElements() * 2; if ((S1Op1 && !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) || (S2Op1 && !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start))) return false; if ((M1Start != 0 && M1Start != (NumElements / 2)) || (M2Start != 0 && M2Start != (NumElements / 2))) return false; if (S1Op1 && S2Op1 && M1Start != M2Start) return false; return true; } /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth /// of the vector elements. static bool areExtractExts(Value *Ext1, Value *Ext2) { auto areExtDoubled = [](Instruction *Ext) { return Ext->getType()->getScalarSizeInBits() == 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); }; if (!match(Ext1, m_ZExtOrSExt(m_Value())) || !match(Ext2, m_ZExtOrSExt(m_Value())) || !areExtDoubled(cast(Ext1)) || !areExtDoubled(cast(Ext2))) return false; return true; } /// Check if Op could be used with vmull_high_p64 intrinsic. static bool isOperandOfVmullHighP64(Value *Op) { Value *VectorOperand = nullptr; ConstantInt *ElementIndex = nullptr; return match(Op, m_ExtractElt(m_Value(VectorOperand), m_ConstantInt(ElementIndex))) && ElementIndex->getValue() == 1 && isa(VectorOperand->getType()) && cast(VectorOperand->getType())->getNumElements() == 2; } /// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic. static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2); } /// Check if sinking \p I's operands to I's basic block is profitable, because /// the operands can be folded into a target instruction, e.g. /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). 
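/// e.g. (illustrative) for
///   %a.wide = zext <8 x i8> %a to <8 x i16>
///   %b.wide = zext <8 x i8> %b to <8 x i16>
///   %sub    = sub <8 x i16> %a.wide, %b.wide
/// sinking both zexts next to the sub lets ISel select a single usubl instead
/// of keeping the widened values live across blocks.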
bool AArch64TargetLowering::shouldSinkOperands( Instruction *I, SmallVectorImpl &Ops) const { if (IntrinsicInst *II = dyn_cast(I)) { switch (II->getIntrinsicID()) { case Intrinsic::aarch64_neon_smull: case Intrinsic::aarch64_neon_umull: if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1), /*AllowSplat=*/true)) { Ops.push_back(&II->getOperandUse(0)); Ops.push_back(&II->getOperandUse(1)); return true; } [[fallthrough]]; case Intrinsic::fma: if (isa(I->getType()) && cast(I->getType())->getElementType()->isHalfTy() && !Subtarget->hasFullFP16()) return false; [[fallthrough]]; case Intrinsic::aarch64_neon_sqdmull: case Intrinsic::aarch64_neon_sqdmulh: case Intrinsic::aarch64_neon_sqrdmulh: // Sink splats for index lane variants if (isSplatShuffle(II->getOperand(0))) Ops.push_back(&II->getOperandUse(0)); if (isSplatShuffle(II->getOperand(1))) Ops.push_back(&II->getOperandUse(1)); return !Ops.empty(); case Intrinsic::aarch64_sve_ptest_first: case Intrinsic::aarch64_sve_ptest_last: if (auto *IIOp = dyn_cast(II->getOperand(0))) if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue) Ops.push_back(&II->getOperandUse(0)); return !Ops.empty(); case Intrinsic::aarch64_sme_write_horiz: case Intrinsic::aarch64_sme_write_vert: case Intrinsic::aarch64_sme_writeq_horiz: case Intrinsic::aarch64_sme_writeq_vert: { auto *Idx = dyn_cast(II->getOperand(1)); if (!Idx || Idx->getOpcode() != Instruction::Add) return false; Ops.push_back(&II->getOperandUse(1)); return true; } case Intrinsic::aarch64_sme_read_horiz: case Intrinsic::aarch64_sme_read_vert: case Intrinsic::aarch64_sme_readq_horiz: case Intrinsic::aarch64_sme_readq_vert: case Intrinsic::aarch64_sme_ld1b_vert: case Intrinsic::aarch64_sme_ld1h_vert: case Intrinsic::aarch64_sme_ld1w_vert: case Intrinsic::aarch64_sme_ld1d_vert: case Intrinsic::aarch64_sme_ld1q_vert: case Intrinsic::aarch64_sme_st1b_vert: case Intrinsic::aarch64_sme_st1h_vert: case Intrinsic::aarch64_sme_st1w_vert: case Intrinsic::aarch64_sme_st1d_vert: case Intrinsic::aarch64_sme_st1q_vert: case Intrinsic::aarch64_sme_ld1b_horiz: case Intrinsic::aarch64_sme_ld1h_horiz: case Intrinsic::aarch64_sme_ld1w_horiz: case Intrinsic::aarch64_sme_ld1d_horiz: case Intrinsic::aarch64_sme_ld1q_horiz: case Intrinsic::aarch64_sme_st1b_horiz: case Intrinsic::aarch64_sme_st1h_horiz: case Intrinsic::aarch64_sme_st1w_horiz: case Intrinsic::aarch64_sme_st1d_horiz: case Intrinsic::aarch64_sme_st1q_horiz: { auto *Idx = dyn_cast(II->getOperand(3)); if (!Idx || Idx->getOpcode() != Instruction::Add) return false; Ops.push_back(&II->getOperandUse(3)); return true; } case Intrinsic::aarch64_neon_pmull: if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) return false; Ops.push_back(&II->getOperandUse(0)); Ops.push_back(&II->getOperandUse(1)); return true; case Intrinsic::aarch64_neon_pmull64: if (!areOperandsOfVmullHighP64(II->getArgOperand(0), II->getArgOperand(1))) return false; Ops.push_back(&II->getArgOperandUse(0)); Ops.push_back(&II->getArgOperandUse(1)); return true; default: return false; } } if (!I->getType()->isVectorTy()) return false; switch (I->getOpcode()) { case Instruction::Sub: case Instruction::Add: { if (!areExtractExts(I->getOperand(0), I->getOperand(1))) return false; // If the exts' operands extract either the lower or upper elements, we // can sink them too. 
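// e.g. when both shuffles take the high halves, ISel can select ssubl2/usubl2
// directly on the original 128-bit vectors.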
auto Ext1 = cast(I->getOperand(0)); auto Ext2 = cast(I->getOperand(1)); if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) { Ops.push_back(&Ext1->getOperandUse(0)); Ops.push_back(&Ext2->getOperandUse(0)); } Ops.push_back(&I->getOperandUse(0)); Ops.push_back(&I->getOperandUse(1)); return true; } case Instruction::Or: { // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) -> // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1) if (Subtarget->hasNEON()) { Instruction *OtherAnd, *IA, *IB; Value *MaskValue; // MainAnd refers to And instruction that has 'Not' as one of its operands if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)), m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))), m_Instruction(IA)))))) { if (match(OtherAnd, m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) { Instruction *MainAnd = I->getOperand(0) == OtherAnd ? cast(I->getOperand(1)) : cast(I->getOperand(0)); // Both Ands should be in same basic block as Or if (I->getParent() != MainAnd->getParent() || I->getParent() != OtherAnd->getParent()) return false; // Non-mask operands of both Ands should also be in same basic block if (I->getParent() != IA->getParent() || I->getParent() != IB->getParent()) return false; Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0)); Ops.push_back(&I->getOperandUse(0)); Ops.push_back(&I->getOperandUse(1)); return true; } } } return false; } case Instruction::Mul: { int NumZExts = 0, NumSExts = 0; for (auto &Op : I->operands()) { // Make sure we are not already sinking this operand if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) continue; if (match(&Op, m_SExt(m_Value()))) { NumSExts++; continue; } else if (match(&Op, m_ZExt(m_Value()))) { NumZExts++; continue; } ShuffleVectorInst *Shuffle = dyn_cast(Op); // If the Shuffle is a splat and the operand is a zext/sext, sinking the // operand and the s/zext can help create indexed s/umull. This is // especially useful to prevent i64 mul being scalarized. if (Shuffle && isSplatShuffle(Shuffle) && match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) { Ops.push_back(&Shuffle->getOperandUse(0)); Ops.push_back(&Op); if (match(Shuffle->getOperand(0), m_SExt(m_Value()))) NumSExts++; else NumZExts++; continue; } if (!Shuffle) continue; Value *ShuffleOperand = Shuffle->getOperand(0); InsertElementInst *Insert = dyn_cast(ShuffleOperand); if (!Insert) continue; Instruction *OperandInstr = dyn_cast(Insert->getOperand(1)); if (!OperandInstr) continue; ConstantInt *ElementConstant = dyn_cast(Insert->getOperand(2)); // Check that the insertelement is inserting into element 0 if (!ElementConstant || !ElementConstant->isZero()) continue; unsigned Opcode = OperandInstr->getOpcode(); if (Opcode == Instruction::SExt) NumSExts++; else if (Opcode == Instruction::ZExt) NumZExts++; else { // If we find that the top bits are known 0, then we can sink and allow // the backend to generate a umull. unsigned Bitwidth = I->getType()->getScalarSizeInBits(); APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2); const DataLayout &DL = I->getFunction()->getParent()->getDataLayout(); if (!MaskedValueIsZero(OperandInstr, UpperMask, DL)) continue; NumZExts++; } Ops.push_back(&Shuffle->getOperandUse(0)); Ops.push_back(&Op); } // Is it profitable to sink if we found two of the same type of extends. 
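// (A mixed sext/zext pair cannot be matched by a single smull/umull, so sinking
// would not pay off in that case.)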
return !Ops.empty() && (NumSExts == 2 || NumZExts == 2); } default: return false; } return false; } static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy, bool IsLittleEndian) { Value *Op = ZExt->getOperand(0); auto *SrcTy = cast(Op->getType()); auto SrcWidth = cast(SrcTy->getElementType())->getBitWidth(); auto DstWidth = cast(DstTy->getElementType())->getBitWidth(); if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64) return false; assert(DstWidth % SrcWidth == 0 && "TBL lowering is not supported for a ZExt instruction with this " "source & destination element type."); unsigned ZExtFactor = DstWidth / SrcWidth; unsigned NumElts = SrcTy->getNumElements(); IRBuilder<> Builder(ZExt); SmallVector Mask; // Create a mask that selects <0,...,Op[i]> for each lane of the destination // vector to replace the original ZExt. This can later be lowered to a set of // tbl instructions. for (unsigned i = 0; i < NumElts * ZExtFactor; i++) { if (IsLittleEndian) { if (i % ZExtFactor == 0) Mask.push_back(i / ZExtFactor); else Mask.push_back(NumElts); } else { if ((i + 1) % ZExtFactor == 0) Mask.push_back((i - ZExtFactor + 1) / ZExtFactor); else Mask.push_back(NumElts); } } auto *FirstEltZero = Builder.CreateInsertElement( PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0)); Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask); Result = Builder.CreateBitCast(Result, DstTy); if (DstTy != ZExt->getType()) Result = Builder.CreateZExt(Result, ZExt->getType()); ZExt->replaceAllUsesWith(Result); ZExt->eraseFromParent(); return true; } static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { IRBuilder<> Builder(TI); SmallVector Parts; int NumElements = cast(TI->getType())->getNumElements(); auto *SrcTy = cast(TI->getOperand(0)->getType()); auto *DstTy = cast(TI->getType()); assert(SrcTy->getElementType()->isIntegerTy() && "Non-integer type source vector element is not supported"); assert(DstTy->getElementType()->isIntegerTy(8) && "Unsupported destination vector element type"); unsigned SrcElemTySz = cast(SrcTy->getElementType())->getBitWidth(); unsigned DstElemTySz = cast(DstTy->getElementType())->getBitWidth(); assert((SrcElemTySz % DstElemTySz == 0) && "Cannot lower truncate to tbl instructions for a source element size " "that is not divisible by the destination element size"); unsigned TruncFactor = SrcElemTySz / DstElemTySz; assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) && "Unsupported source vector element type size"); Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16); // Create a mask to choose every nth byte from the source vector table of // bytes to create the truncated destination vector, where 'n' is the truncate // ratio. For example, for a truncate from Yxi64 to Yxi8, choose // 0,8,16,..Y*8th bytes for the little-endian format SmallVector MaskConst; for (int Itr = 0; Itr < 16; Itr++) { if (Itr < NumElements) MaskConst.push_back(Builder.getInt8( IsLittleEndian ? Itr * TruncFactor : Itr * TruncFactor + (TruncFactor - 1))); else MaskConst.push_back(Builder.getInt8(255)); } int MaxTblSz = 128 * 4; int MaxSrcSz = SrcElemTySz * NumElements; int ElemsPerTbl = (MaxTblSz > MaxSrcSz) ? 
NumElements : (MaxTblSz / SrcElemTySz); assert(ElemsPerTbl <= 16 && "Maximum elements selected using TBL instruction cannot exceed 16!"); int ShuffleCount = 128 / SrcElemTySz; SmallVector ShuffleLanes; for (int i = 0; i < ShuffleCount; ++i) ShuffleLanes.push_back(i); // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated, // call TBL & save the result in a vector of TBL results for combining later. SmallVector Results; while (ShuffleLanes.back() < NumElements) { Parts.push_back(Builder.CreateBitCast( Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy)); if (Parts.size() == 4) { auto *F = Intrinsic::getDeclaration(TI->getModule(), Intrinsic::aarch64_neon_tbl4, VecTy); Parts.push_back(ConstantVector::get(MaskConst)); Results.push_back(Builder.CreateCall(F, Parts)); Parts.clear(); } for (int i = 0; i < ShuffleCount; ++i) ShuffleLanes[i] += ShuffleCount; } assert((Parts.empty() || Results.empty()) && "Lowering trunc for vectors requiring different TBL instructions is " "not supported!"); // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD // registers if (!Parts.empty()) { Intrinsic::ID TblID; switch (Parts.size()) { case 1: TblID = Intrinsic::aarch64_neon_tbl1; break; case 2: TblID = Intrinsic::aarch64_neon_tbl2; break; case 3: TblID = Intrinsic::aarch64_neon_tbl3; break; } auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy); Parts.push_back(ConstantVector::get(MaskConst)); Results.push_back(Builder.CreateCall(F, Parts)); } // Extract the destination vector from TBL result(s) after combining them // where applicable. Currently, at most two TBLs are supported. assert(Results.size() <= 2 && "Trunc lowering does not support generation of " "more than 2 tbl instructions!"); Value *FinalResult = Results[0]; if (Results.size() == 1) { if (ElemsPerTbl < 16) { SmallVector FinalMask(ElemsPerTbl); std::iota(FinalMask.begin(), FinalMask.end(), 0); FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask); } } else { SmallVector FinalMask(ElemsPerTbl * Results.size()); if (ElemsPerTbl < 16) { std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0); std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16); } else { std::iota(FinalMask.begin(), FinalMask.end(), 0); } FinalResult = Builder.CreateShuffleVector(Results[0], Results[1], FinalMask); } TI->replaceAllUsesWith(FinalResult); TI->eraseFromParent(); } bool AArch64TargetLowering::optimizeExtendOrTruncateConversion( Instruction *I, Loop *L, const TargetTransformInfo &TTI) const { // shuffle_vector instructions are serialized when targeting SVE, // see LowerSPLAT_VECTOR. This peephole is not beneficial. if (Subtarget->useSVEForFixedLengthVectors()) return false; // Try to optimize conversions using tbl. This requires materializing constant // index vectors, which can increase code size and add loads. Skip the // transform unless the conversion is in a loop block guaranteed to execute // and we are not optimizing for size. Function *F = I->getParent()->getParent(); if (!L || L->getHeader() != I->getParent() || F->hasMinSize() || F->hasOptSize()) return false; auto *SrcTy = dyn_cast(I->getOperand(0)->getType()); auto *DstTy = dyn_cast(I->getType()); if (!SrcTy || !DstTy) return false; // Convert 'zext %x to ' to a shuffle that can be // lowered to tbl instructions to insert the original i8 elements // into i8x lanes. This is enabled for cases where it is beneficial. 
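// e.g. (illustrative, little-endian) a zext from <8 x i8> to <8 x i32> becomes a
// byte shuffle that places each source byte in the low byte of its 4-byte lane
// and a known-zero byte everywhere else, which later matches a tbl lookup.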
auto *ZExt = dyn_cast(I); if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) { auto DstWidth = DstTy->getElementType()->getScalarSizeInBits(); if (DstWidth % 8 != 0) return false; auto *TruncDstType = cast(VectorType::getTruncatedElementVectorType(DstTy)); // If the ZExt can be lowered to a single ZExt to the next power-of-2 and // the remaining ZExt folded into the user, don't use tbl lowering. auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits(); if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType, TargetTransformInfo::getCastContextHint(I), TTI::TCK_SizeAndLatency, I) == TTI::TCC_Free) { if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits()) return false; DstTy = TruncDstType; } return createTblShuffleForZExt(ZExt, DstTy, Subtarget->isLittleEndian()); } auto *UIToFP = dyn_cast(I); if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) && DstTy->getElementType()->isFloatTy()) { IRBuilder<> Builder(I); auto *ZExt = cast( Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy))); auto *UI = Builder.CreateUIToFP(ZExt, DstTy); I->replaceAllUsesWith(UI); I->eraseFromParent(); return createTblShuffleForZExt(ZExt, cast(ZExt->getType()), Subtarget->isLittleEndian()); } // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui // followed by a truncate lowered to using tbl.4. auto *FPToUI = dyn_cast(I); if (FPToUI && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) && SrcTy->getElementType()->isFloatTy() && DstTy->getElementType()->isIntegerTy(8)) { IRBuilder<> Builder(I); auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0), VectorType::getInteger(SrcTy)); auto *TruncI = Builder.CreateTrunc(WideConv, DstTy); I->replaceAllUsesWith(TruncI); I->eraseFromParent(); createTblForTrunc(cast(TruncI), Subtarget->isLittleEndian()); return true; } // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate // tbl instruction selecting the lowest/highest (little/big endian) 8 bits // per lane of the input that is represented using 1,2,3 or 4 128-bit table // registers auto *TI = dyn_cast(I); if (TI && DstTy->getElementType()->isIntegerTy(8) && ((SrcTy->getElementType()->isIntegerTy(32) || SrcTy->getElementType()->isIntegerTy(64)) && (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) { createTblForTrunc(TI, Subtarget->isLittleEndian()); return true; } return false; } bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const { if (!LoadedType.isSimple() || (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) return false; // Cyclone supports unaligned accesses. RequiredAligment = Align(1); unsigned NumBits = LoadedType.getSizeInBits(); return NumBits == 32 || NumBits == 64; } /// A helper function for determining the number of interleaved accesses we /// will generate when lowering accesses of the given type. 
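/// e.g. a fixed <16 x i32> group on a 128-bit NEON target requires
/// (16 * 32 + 127) / 128 = 4 accesses.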
unsigned AArch64TargetLowering::getNumInterleavedAccesses( VectorType *VecTy, const DataLayout &DL, bool UseScalable) const { unsigned VecSize = 128; unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); unsigned MinElts = VecTy->getElementCount().getKnownMinValue(); if (UseScalable) VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u); return std::max(1, (MinElts * ElSize + 127) / VecSize); } MachineMemOperand::Flags AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const { if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) return MOStridedAccess; return MachineMemOperand::MONone; } bool AArch64TargetLowering::isLegalInterleavedAccessType( VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const { unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); auto EC = VecTy->getElementCount(); unsigned MinElts = EC.getKnownMinValue(); UseScalable = false; if (!VecTy->isScalableTy() && !Subtarget->hasNEON()) return false; if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME()) return false; // Ensure that the predicate for this number of elements is available. if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts)) return false; // Ensure the number of vector elements is greater than 1. if (MinElts < 2) return false; // Ensure the element type is legal. if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) return false; if (EC.isScalable()) { UseScalable = true; return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0; } unsigned VecSize = DL.getTypeSizeInBits(VecTy); if (!Subtarget->isNeonAvailable() || (Subtarget->useSVEForFixedLengthVectors() && (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 || (VecSize < Subtarget->getMinSVEVectorSizeInBits() && isPowerOf2_32(MinElts) && VecSize > 128)))) { UseScalable = true; return true; } // Ensure the total vector size is 64 or a multiple of 128. Types larger than // 128 will be split into multiple interleaved accesses. 
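// e.g. on a NEON-only configuration, a 512-bit <8 x i64> group is accepted here
// and later split into four 128-bit ldN/stN operations.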
return VecSize == 64 || VecSize % 128 == 0; } static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) { if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext())) return ScalableVectorType::get(VTy->getElementType(), 2); if (VTy->getElementType() == Type::getFloatTy(VTy->getContext())) return ScalableVectorType::get(VTy->getElementType(), 4); if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext())) return ScalableVectorType::get(VTy->getElementType(), 8); if (VTy->getElementType() == Type::getHalfTy(VTy->getContext())) return ScalableVectorType::get(VTy->getElementType(), 8); if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext())) return ScalableVectorType::get(VTy->getElementType(), 2); if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext())) return ScalableVectorType::get(VTy->getElementType(), 4); if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext())) return ScalableVectorType::get(VTy->getElementType(), 8); if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext())) return ScalableVectorType::get(VTy->getElementType(), 16); llvm_unreachable("Cannot handle input vector type"); } static Function *getStructuredLoadFunction(Module *M, unsigned Factor, bool Scalable, Type *LDVTy, Type *PtrTy) { assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor"); static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret, Intrinsic::aarch64_sve_ld3_sret, Intrinsic::aarch64_sve_ld4_sret}; static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2, Intrinsic::aarch64_neon_ld3, Intrinsic::aarch64_neon_ld4}; if (Scalable) return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy}); return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy}); } static Function *getStructuredStoreFunction(Module *M, unsigned Factor, bool Scalable, Type *STVTy, Type *PtrTy) { assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor"); static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2, Intrinsic::aarch64_sve_st3, Intrinsic::aarch64_sve_st4}; static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2, Intrinsic::aarch64_neon_st3, Intrinsic::aarch64_neon_st4}; if (Scalable) return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy}); return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy}); } /// Lower an interleaved load into a ldN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements /// /// Into: /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr) /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); const DataLayout &DL = LI->getModule()->getDataLayout(); VectorType *VTy = Shuffles[0]->getType(); // Skip if we do not have NEON and skip illegal vector types. 
We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. bool UseScalable; if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL, UseScalable)) return false; unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable); auto *FVTy = cast(VTy); // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. Type *EltTy = FVTy->getElementType(); if (EltTy->isPointerTy()) FVTy = FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements()); // If we're going to generate more than one load, reset the sub-vector type // to something legal. FVTy = FixedVectorType::get(FVTy->getElementType(), FVTy->getNumElements() / NumLoads); auto *LDVTy = UseScalable ? cast(getSVEContainerIRType(FVTy)) : FVTy; IRBuilder<> Builder(LI); // The base address of the load. Value *BaseAddr = LI->getPointerOperand(); if (NumLoads > 1) { // We will compute the pointer operand of each load from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( BaseAddr, LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())); } Type *PtrTy = LI->getPointerOperandType(); Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()), LDVTy->getElementCount()); Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor, UseScalable, LDVTy, PtrTy); // Holds sub-vectors extracted from the load intrinsic return values. The // sub-vectors are associated with the shufflevector instructions they will // replace. DenseMap> SubVecs; Value *PTrue = nullptr; if (UseScalable) { std::optional PgPattern = getSVEPredPatternFromNumElements(FVTy->getNumElements()); if (Subtarget->getMinSVEVectorSizeInBits() == Subtarget->getMaxSVEVectorSizeInBits() && Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy)) PgPattern = AArch64SVEPredPattern::all; auto *PTruePat = ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern); PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy}, {PTruePat}); } for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { // If we're generating more than one load, compute the base address of // subsequent loads as an offset from the previous. if (LoadCount > 0) BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr, FVTy->getNumElements() * Factor); CallInst *LdN; if (UseScalable) LdN = Builder.CreateCall( LdNFunc, {PTrue, Builder.CreateBitCast(BaseAddr, PtrTy)}, "ldN"); else LdN = Builder.CreateCall(LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN"); // Extract and store the sub-vectors returned by the load intrinsic. for (unsigned i = 0; i < Shuffles.size(); i++) { ShuffleVectorInst *SVI = Shuffles[i]; unsigned Index = Indices[i]; Value *SubVec = Builder.CreateExtractValue(LdN, Index); if (UseScalable) SubVec = Builder.CreateExtractVector( FVTy, SubVec, ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0)); // Convert the integer vector to pointer vector if the element is pointer. if (EltTy->isPointerTy()) SubVec = Builder.CreateIntToPtr( SubVec, FixedVectorType::get(SVI->getType()->getElementType(), FVTy->getNumElements())); SubVecs[SVI].push_back(SubVec); } } // Replace uses of the shufflevector instructions with the sub-vectors // returned by the load intrinsic. 
If a shufflevector instruction is // associated with more than one sub-vector, those sub-vectors will be // concatenated into a single wide vector. for (ShuffleVectorInst *SVI : Shuffles) { auto &SubVec = SubVecs[SVI]; auto *WideVec = SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; SVI->replaceAllUsesWith(WideVec); } return true; } /// Lower an interleaved store into a stN intrinsic. /// /// E.g. Lower an interleaved store (Factor = 3): /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> /// store <12 x i32> %i.vec, <12 x i32>* %ptr /// /// Into: /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) /// /// Note that the new shufflevectors will be removed and we'll only generate one /// st3 instruction in CodeGen. /// /// Example for a more general valid mask (Factor 3). Lower: /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> /// store <12 x i32> %i.vec, <12 x i32>* %ptr /// /// Into: /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7> /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); auto *VecTy = cast(SVI->getType()); assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); unsigned LaneLen = VecTy->getNumElements() / Factor; Type *EltTy = VecTy->getElementType(); auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); bool UseScalable; // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) return false; unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable); Value *Op0 = SVI->getOperand(0); Value *Op1 = SVI->getOperand(1); IRBuilder<> Builder(SI); // StN intrinsics don't support pointer vectors as arguments. Convert pointer // vectors to integer vectors. if (EltTy->isPointerTy()) { Type *IntTy = DL.getIntPtrType(EltTy); unsigned NumOpElts = cast(Op0->getType())->getNumElements(); // Convert to the corresponding integer vector. auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts); Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); SubVecTy = FixedVectorType::get(IntTy, LaneLen); } // If we're going to generate more than one store, reset the lane length // and sub-vector type to something legal. LaneLen /= NumStores; SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); auto *STVTy = UseScalable ? cast(getSVEContainerIRType(SubVecTy)) : SubVecTy; // The base address of the store. Value *BaseAddr = SI->getPointerOperand(); if (NumStores > 1) { // We will compute the pointer operand of each store from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. 
BaseAddr = Builder.CreateBitCast( BaseAddr, SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())); } auto Mask = SVI->getShuffleMask(); // Sanity check if all the indices are NOT in range. // If mask is `poison`, `Mask` may be a vector of -1s. // If all of them are `poison`, OOB read will happen later. if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) { return false; } // A 64bit st2 which does not start at element 0 will involved adding extra // ext elements, making the st2 unprofitable. if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 && Mask[0] != 0) return false; Type *PtrTy = SI->getPointerOperandType(); Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()), STVTy->getElementCount()); Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor, UseScalable, STVTy, PtrTy); Value *PTrue = nullptr; if (UseScalable) { std::optional PgPattern = getSVEPredPatternFromNumElements(SubVecTy->getNumElements()); if (Subtarget->getMinSVEVectorSizeInBits() == Subtarget->getMaxSVEVectorSizeInBits() && Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(SubVecTy)) PgPattern = AArch64SVEPredPattern::all; auto *PTruePat = ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern); PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy}, {PTruePat}); } for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { SmallVector Ops; // Split the shufflevector operands into sub vectors for the new stN call. for (unsigned i = 0; i < Factor; i++) { Value *Shuffle; unsigned IdxI = StoreCount * LaneLen * Factor + i; if (Mask[IdxI] >= 0) { Shuffle = Builder.CreateShuffleVector( Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)); } else { unsigned StartMask = 0; for (unsigned j = 1; j < LaneLen; j++) { unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i; if (Mask[IdxJ] >= 0) { StartMask = Mask[IdxJ] - j; break; } } // Note: Filling undef gaps with random elements is ok, since // those elements were being written anyway (with undefs). // In the case of all undefs we're defaulting to using elems from 0 // Note: StartMask cannot be negative, it's checked in // isReInterleaveMask Shuffle = Builder.CreateShuffleVector( Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)); } if (UseScalable) Shuffle = Builder.CreateInsertVector( STVTy, UndefValue::get(STVTy), Shuffle, ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0)); Ops.push_back(Shuffle); } if (UseScalable) Ops.push_back(PTrue); // If we generating more than one store, we compute the base address of // subsequent stores as an offset from the previous. if (StoreCount > 0) BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(), BaseAddr, LaneLen * Factor); Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); Builder.CreateCall(StNFunc, Ops); } return true; } bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( IntrinsicInst *DI, LoadInst *LI) const { // Only deinterleave2 supported at present. if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2) return false; // Only a factor of 2 supported at present. const unsigned Factor = 2; VectorType *VTy = cast(DI->getType()->getContainedType(0)); const DataLayout &DL = DI->getModule()->getDataLayout(); bool UseScalable; if (!isLegalInterleavedAccessType(VTy, DL, UseScalable)) return false; // TODO: Add support for using SVE instructions with fixed types later, using // the code from lowerInterleavedLoad to obtain the correct container type. 
if (UseScalable && !VTy->isScalableTy()) return false; unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable); VectorType *LdTy = VectorType::get(VTy->getElementType(), VTy->getElementCount().divideCoefficientBy(NumLoads)); Type *PtrTy = LI->getPointerOperandType(); Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor, UseScalable, LdTy, PtrTy); IRBuilder<> Builder(LI); Value *Pred = nullptr; if (UseScalable) Pred = Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue()); Value *BaseAddr = LI->getPointerOperand(); Value *Result; if (NumLoads > 1) { Value *Left = PoisonValue::get(VTy); Value *Right = PoisonValue::get(VTy); for (unsigned I = 0; I < NumLoads; ++I) { Value *Offset = Builder.getInt64(I * Factor); Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset}); Value *LdN = nullptr; if (UseScalable) LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN"); else LdN = Builder.CreateCall(LdNFunc, Address, "ldN"); Value *Idx = Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue()); Left = Builder.CreateInsertVector( VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx); Right = Builder.CreateInsertVector( VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx); } Result = PoisonValue::get(DI->getType()); Result = Builder.CreateInsertValue(Result, Left, 0); Result = Builder.CreateInsertValue(Result, Right, 1); } else { if (UseScalable) Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN"); else Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN"); } DI->replaceAllUsesWith(Result); return true; } bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore( IntrinsicInst *II, StoreInst *SI) const { // Only interleave2 supported at present. if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2) return false; // Only a factor of 2 supported at present. const unsigned Factor = 2; VectorType *VTy = cast(II->getOperand(0)->getType()); const DataLayout &DL = II->getModule()->getDataLayout(); bool UseScalable; if (!isLegalInterleavedAccessType(VTy, DL, UseScalable)) return false; // TODO: Add support for using SVE instructions with fixed types later, using // the code from lowerInterleavedStore to obtain the correct container type. 
if (UseScalable && !VTy->isScalableTy()) return false; unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable); VectorType *StTy = VectorType::get(VTy->getElementType(), VTy->getElementCount().divideCoefficientBy(NumStores)); Type *PtrTy = SI->getPointerOperandType(); Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor, UseScalable, StTy, PtrTy); IRBuilder<> Builder(SI); Value *BaseAddr = SI->getPointerOperand(); Value *Pred = nullptr; if (UseScalable) Pred = Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue()); Value *L = II->getOperand(0); Value *R = II->getOperand(1); for (unsigned I = 0; I < NumStores; ++I) { Value *Address = BaseAddr; if (NumStores > 1) { Value *Offset = Builder.getInt64(I * Factor); Address = Builder.CreateGEP(StTy, BaseAddr, {Offset}); Value *Idx = Builder.getInt64(I * StTy->getElementCount().getKnownMinValue()); L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx); R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx); } if (UseScalable) Builder.CreateCall(StNFunc, {L, R, Pred, Address}); else Builder.CreateCall(StNFunc, {L, R, Address}); } return true; } EVT AArch64TargetLowering::getOptimalMemOpType( const MemOp &Op, const AttributeList &FuncAttributes) const { bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat); bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; // Only use AdvSIMD to implement memset of 32-byte and above. It would have // taken one instruction to materialize the v2i64 zero and one store (with // restrictive addressing mode). Just do i64 stores. bool IsSmallMemset = Op.isMemset() && Op.size() < 32; auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { if (Op.isAligned(AlignCheck)) return true; unsigned Fast; return allowsMisalignedMemoryAccesses(VT, 0, Align(1), MachineMemOperand::MONone, &Fast) && Fast; }; if (CanUseNEON && Op.isMemset() && !IsSmallMemset && AlignmentIsAcceptable(MVT::v16i8, Align(16))) return MVT::v16i8; if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16))) return MVT::f128; if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8))) return MVT::i64; if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4))) return MVT::i32; return MVT::Other; } LLT AArch64TargetLowering::getOptimalMemOpLLT( const MemOp &Op, const AttributeList &FuncAttributes) const { bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat); bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; // Only use AdvSIMD to implement memset of 32-byte and above. It would have // taken one instruction to materialize the v2i64 zero and one store (with // restrictive addressing mode). Just do i64 stores. 
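// e.g. a 16-byte zero memset is better served by 'stp xzr, xzr, [x0]' than by
// materialising a zero in a q register and storing that.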
bool IsSmallMemset = Op.isMemset() && Op.size() < 32; auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { if (Op.isAligned(AlignCheck)) return true; unsigned Fast; return allowsMisalignedMemoryAccesses(VT, 0, Align(1), MachineMemOperand::MONone, &Fast) && Fast; }; if (CanUseNEON && Op.isMemset() && !IsSmallMemset && AlignmentIsAcceptable(MVT::v2i64, Align(16))) return LLT::fixed_vector(2, 64); if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16))) return LLT::scalar(128); if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8))) return LLT::scalar(64); if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4))) return LLT::scalar(32); return LLT(); } // 12-bit optionally shifted immediates are legal for adds. bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { if (Immed == std::numeric_limits::min()) { LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed << ": avoid UB for INT64_MIN\n"); return false; } // Same encoding for add/sub, just flip the sign. Immed = std::abs(Immed); bool IsLegal = ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)); LLVM_DEBUG(dbgs() << "Is " << Immed << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n"); return IsLegal; } // Return false to prevent folding // (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine, // if the folding leads to worse code. bool AArch64TargetLowering::isMulAddWithConstProfitable( SDValue AddNode, SDValue ConstNode) const { // Let the DAGCombiner decide for vector types and large types. const EVT VT = AddNode.getValueType(); if (VT.isVector() || VT.getScalarSizeInBits() > 64) return true; // It is worse if c1 is legal add immediate, while c1*c2 is not // and has to be composed by at least two instructions. const ConstantSDNode *C1Node = cast(AddNode.getOperand(1)); const ConstantSDNode *C2Node = cast(ConstNode); const int64_t C1 = C1Node->getSExtValue(); const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue(); if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue())) return true; SmallVector Insn; // Adapt to the width of a register. unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64; AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn); if (Insn.size() > 1) return false; // Default to true and let the DAGCombiner decide. return true; } // Integer comparisons are implemented with ADDS/SUBS, so the range of valid // immediates is the same as for an add or a sub. bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { return isLegalAddImmediate(Immed); } /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AMode, Type *Ty, unsigned AS, Instruction *I) const { // AArch64 has five basic addressing modes: // reg // reg + 9-bit signed offset // reg + SIZE_IN_BYTES * 12-bit unsigned offset // reg1 + reg2 // reg + SIZE_IN_BYTES * reg // No global is ever allowed as a base. if (AMode.BaseGV) return false; // No reg+reg+imm addressing. 
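// (There is no [base, index, #imm] form on AArch64, so a base register plus a
// scaled register plus an immediate cannot be folded into one access.)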
if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale) return false; // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and // `2*ScaledReg` into `BaseReg + ScaledReg` AddrMode AM = AMode; if (AM.Scale && !AM.HasBaseReg) { if (AM.Scale == 1) { AM.HasBaseReg = true; AM.Scale = 0; } else if (AM.Scale == 2) { AM.HasBaseReg = true; AM.Scale = 1; } else { return false; } } // A base register is required in all addressing modes. if (!AM.HasBaseReg) return false; if (Ty->isScalableTy()) { if (isa(Ty)) { uint64_t VecElemNumBytes = DL.getTypeSizeInBits(cast(Ty)->getElementType()) / 8; return AM.HasBaseReg && !AM.BaseOffs && (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes); } return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale; } // check reg + imm case: // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 uint64_t NumBytes = 0; if (Ty->isSized()) { uint64_t NumBits = DL.getTypeSizeInBits(Ty); NumBytes = NumBits / 8; if (!isPowerOf2_64(NumBits)) NumBytes = 0; } if (!AM.Scale) { int64_t Offset = AM.BaseOffs; // 9-bit signed offset if (isInt<9>(Offset)) return true; // 12-bit unsigned offset unsigned shift = Log2_64(NumBytes); if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && // Must be a multiple of NumBytes (NumBytes is a power of 2) (Offset >> shift) << shift == Offset) return true; return false; } // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes); } bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const { // Consider splitting large offset of struct or array. return true; } bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd( const MachineFunction &MF, EVT VT) const { VT = VT.getScalarType(); if (!VT.isSimple()) return false; switch (VT.getSimpleVT().SimpleTy) { case MVT::f16: return Subtarget->hasFullFP16(); case MVT::f32: case MVT::f64: return true; default: break; } return false; } bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F, Type *Ty) const { switch (Ty->getScalarType()->getTypeID()) { case Type::FloatTyID: case Type::DoubleTyID: return true; default: return false; } } bool AArch64TargetLowering::generateFMAsInMachineCombiner( EVT VT, CodeGenOpt::Level OptLevel) const { return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector() && !useSVEForFixedLengthVectorVT(VT); } const MCPhysReg * AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { // LR is a callee-save register, but we must treat it as clobbered by any call // site. Hence we include LR in the scratch registers, which are in turn added // as implicit-defs for stackmaps and patchpoints. static const MCPhysReg ScratchRegs[] = { AArch64::X16, AArch64::X17, AArch64::LR, 0 }; return ScratchRegs; } ArrayRef AArch64TargetLowering::getRoundingControlRegisters() const { static const MCPhysReg RCRegs[] = {AArch64::FPCR}; return RCRegs; } bool AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const { assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && "Expected shift op"); SDValue ShiftLHS = N->getOperand(0); EVT VT = N->getValueType(0); // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not // combine it with shift 'N' to let it be lowered to UBFX except: // ((x >> C) & mask) << C. 
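// e.g. ((x >> 3) & 0x1f) is a single ubfx; commuting with a later shl only helps
// when the shl amount is also 3, since the whole expression is then just
// (x & 0xf8).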
if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && isa(ShiftLHS.getOperand(1))) { uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1); if (isMask_64(TruncMask)) { SDValue AndLHS = ShiftLHS.getOperand(0); if (AndLHS.getOpcode() == ISD::SRL) { if (auto *SRLC = dyn_cast(AndLHS.getOperand(1))) { if (N->getOpcode() == ISD::SHL) if (auto *SHLC = dyn_cast(N->getOperand(1))) return SRLC->getZExtValue() == SHLC->getZExtValue(); return false; } } } } return true; } bool AArch64TargetLowering::isDesirableToCommuteXorWithShift( const SDNode *N) const { assert(N->getOpcode() == ISD::XOR && (N->getOperand(0).getOpcode() == ISD::SHL || N->getOperand(0).getOpcode() == ISD::SRL) && "Expected XOR(SHIFT) pattern"); // Only commute if the entire NOT mask is a hidden shifted mask. auto *XorC = dyn_cast(N->getOperand(1)); auto *ShiftC = dyn_cast(N->getOperand(0).getOperand(1)); if (XorC && ShiftC) { unsigned MaskIdx, MaskLen; if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) { unsigned ShiftAmt = ShiftC->getZExtValue(); unsigned BitWidth = N->getValueType(0).getScalarSizeInBits(); if (N->getOperand(0).getOpcode() == ISD::SHL) return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt); return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt); } } return false; } bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask( const SDNode *N, CombineLevel Level) const { assert(((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && "Expected shift-shift mask"); // Don't allow multiuse shift folding with the same shift amount. if (!N->getOperand(0)->hasOneUse()) return false; // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns. EVT VT = N->getValueType(0); if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) { auto *C1 = dyn_cast(N->getOperand(0).getOperand(1)); auto *C2 = dyn_cast(N->getOperand(1)); return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue()); } return true; } bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant( unsigned BinOpcode, EVT VT) const { return VT.isScalableVector() && isTypeLegal(VT); } bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); if (BitSize == 0) return false; int64_t Val = Imm.getSExtValue(); if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize)) return true; if ((int64_t)Val < 0) Val = ~Val; if (BitSize == 32) Val &= (1LL << 32) - 1; unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16; // MOVZ is free so return true for one or fewer MOVK. return Shift < 3; } bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) return false; return (Index == 0 || Index == ResVT.getVectorMinNumElements()); } /// Turn vector tests of the signbit in the form of: /// xor (sra X, elt_size(X)-1), -1 /// into: /// cmge X, X, #0 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { EVT VT = N->getValueType(0); if (!Subtarget->hasNEON() || !VT.isVector()) return SDValue(); // There must be a shift right algebraic before the xor, and the xor must be a // 'not' operation. 
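  // e.g. for v4i32: (xor (AArch64ISD::VASHR x, 31), splat(-1)). The VASHR
  // smears the sign bit across each lane, giving all-ones for negative lanes,
  // and the 'not' then yields all-ones exactly for the lanes where x >= 0,
  // which is CMGEz.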
SDValue Shift = N->getOperand(0); SDValue Ones = N->getOperand(1); if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() || !ISD::isBuildVectorAllOnes(Ones.getNode())) return SDValue(); // The shift should be smearing the sign bit across each vector element. auto *ShiftAmt = dyn_cast(Shift.getOperand(1)); EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) return SDValue(); return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0)); } // Given a vecreduce_add node, detect the below pattern and convert it to the // node sequence with UABDL, [S|U]ADB and UADDLP. // // i32 vecreduce_add( // v16i32 abs( // v16i32 sub( // v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b)))) // =================> // i32 vecreduce_add( // v4i32 UADDLP( // v8i16 add( // v8i16 zext( // v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b // v8i16 zext( // v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, SelectionDAG &DAG) { // Assumed i32 vecreduce_add if (N->getValueType(0) != MVT::i32) return SDValue(); SDValue VecReduceOp0 = N->getOperand(0); unsigned Opcode = VecReduceOp0.getOpcode(); // Assumed v16i32 abs if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32) return SDValue(); SDValue ABS = VecReduceOp0; // Assumed v16i32 sub if (ABS->getOperand(0)->getOpcode() != ISD::SUB || ABS->getOperand(0)->getValueType(0) != MVT::v16i32) return SDValue(); SDValue SUB = ABS->getOperand(0); unsigned Opcode0 = SUB->getOperand(0).getOpcode(); unsigned Opcode1 = SUB->getOperand(1).getOpcode(); // Assumed v16i32 type if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 || SUB->getOperand(1)->getValueType(0) != MVT::v16i32) return SDValue(); // Assumed zext or sext bool IsZExt = false; if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) { IsZExt = true; } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) { IsZExt = false; } else return SDValue(); SDValue EXT0 = SUB->getOperand(0); SDValue EXT1 = SUB->getOperand(1); // Assumed zext's operand has v16i8 type if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 || EXT1->getOperand(0)->getValueType(0) != MVT::v16i8) return SDValue(); // Pattern is dectected. Let's convert it to sequence of nodes. SDLoc DL(N); // First, create the node pattern of UABD/SABD. SDValue UABDHigh8Op0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0), DAG.getConstant(8, DL, MVT::i64)); SDValue UABDHigh8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0), DAG.getConstant(8, DL, MVT::i64)); SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8, UABDHigh8Op0, UABDHigh8Op1); SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8); // Second, create the node pattern of UABAL. SDValue UABDLo8Op0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0), DAG.getConstant(0, DL, MVT::i64)); SDValue UABDLo8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0), DAG.getConstant(0, DL, MVT::i64)); SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8, UABDLo8Op0, UABDLo8Op1); SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8); SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD); // Third, create the node of UADDLP. 
SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL); // Fourth, create the node of VECREDUCE_ADD. return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP); } // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce // vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one)) // vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B)) // If we have vectors larger than v16i8 we extract v16i8 vectors, // Follow the same steps above to get DOT instructions concatenate them // and generate vecreduce.add(concat_vector(DOT, DOT2, ..)). static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *ST) { if (!ST->hasDotProd()) return performVecReduceAddCombineWithUADDLP(N, DAG); SDValue Op0 = N->getOperand(0); if (N->getValueType(0) != MVT::i32 || Op0.getValueType().getVectorElementType() != MVT::i32) return SDValue(); unsigned ExtOpcode = Op0.getOpcode(); SDValue A = Op0; SDValue B; if (ExtOpcode == ISD::MUL) { A = Op0.getOperand(0); B = Op0.getOperand(1); if (A.getOpcode() != B.getOpcode() || A.getOperand(0).getValueType() != B.getOperand(0).getValueType()) return SDValue(); ExtOpcode = A.getOpcode(); } if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND) return SDValue(); EVT Op0VT = A.getOperand(0).getValueType(); bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0; bool IsValidSize = Op0VT.getScalarSizeInBits() == 8; if (!IsValidElementCount || !IsValidSize) return SDValue(); SDLoc DL(Op0); // For non-mla reductions B can be set to 1. For MLA we take the operand of // the extend B. if (!B) B = DAG.getConstant(1, DL, Op0VT); else B = B.getOperand(0); unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0; unsigned NumOfVecReduce; EVT TargetType; if (IsMultipleOf16) { NumOfVecReduce = Op0VT.getVectorNumElements() / 16; TargetType = MVT::v4i32; } else { NumOfVecReduce = Op0VT.getVectorNumElements() / 8; TargetType = MVT::v2i32; } auto DotOpcode = (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT; // Handle the case where we need to generate only one Dot operation. if (NumOfVecReduce == 1) { SDValue Zeros = DAG.getConstant(0, DL, TargetType); SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, A.getOperand(0), B); return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot); } // Generate Dot instructions that are multiple of 16. unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16; SmallVector SDotVec16; unsigned I = 0; for (; I < VecReduce16Num; I += 1) { SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32); SDValue Op0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0), DAG.getConstant(I * 16, DL, MVT::i64)); SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B, DAG.getConstant(I * 16, DL, MVT::i64)); SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1); SDotVec16.push_back(Dot); } // Concatenate dot operations. EVT SDot16EVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num); SDValue ConcatSDot16 = DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16); SDValue VecReduceAdd16 = DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16); unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8; if (VecReduce8Num == 0) return VecReduceAdd16; // Generate the remainder Dot operation that is multiple of 8. 
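  // e.g. for a 24-element input, one v16i8 DOT was generated above and a
  // single v8i8 DOT is generated here from elements 16..23; the two scalar
  // vecreduce results are then added together.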
  SmallVector<SDValue, 4> SDotVec8;
  SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
  SDValue Vec8Op0 =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
                  DAG.getConstant(I * 16, DL, MVT::i64));
  SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
                                DAG.getConstant(I * 16, DL, MVT::i64));
  SDValue Dot =
      DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
  SDValue VecReduceAdd8 =
      DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
  return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
                     VecReduceAdd8);
}

// Given an (integer) vecreduce, we know the order of the inputs does not
// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
  auto DetectAddExtract = [&](SDValue A) {
    // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
    // UADDLP(x) if found.
    if (A.getOpcode() != ISD::ADD)
      return SDValue();
    EVT VT = A.getValueType();
    SDValue Op0 = A.getOperand(0);
    SDValue Op1 = A.getOperand(1);
    if (Op0.getOpcode() != Op1.getOpcode() ||
        (Op0.getOpcode() != ISD::ZERO_EXTEND &&
         Op0.getOpcode() != ISD::SIGN_EXTEND))
      return SDValue();
    SDValue Ext0 = Op0.getOperand(0);
    SDValue Ext1 = Op1.getOperand(0);
    if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
        Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
        Ext0.getOperand(0) != Ext1.getOperand(0))
      return SDValue();
    // Check that the type is twice the add types, and the extracts are from
    // the upper/lower parts of the same source.
    if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
        VT.getVectorNumElements() * 2)
      return SDValue();
    if ((Ext0.getConstantOperandVal(1) != 0 &&
         Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
        (Ext1.getConstantOperandVal(1) != 0 &&
         Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
      return SDValue();
    unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ?
AArch64ISD::UADDLP : AArch64ISD::SADDLP; return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0)); }; if (SDValue R = DetectAddExtract(A)) return R; if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse()) if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG)) return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R, A.getOperand(1)); if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse()) if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG)) return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R, A.getOperand(0)); return SDValue(); } static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) { SDValue A = N->getOperand(0); if (A.getOpcode() == ISD::ADD) if (SDValue R = performUADDVAddCombine(A, DAG)) return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R); return SDValue(); } static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); return foldVectorXorShiftIntoCmp(N, DAG, Subtarget); } SDValue AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const { AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N,0); // Lower SDIV as SDIV EVT VT = N->getValueType(0); // For scalable and fixed types, mark them as cheap so we can handle it much // later. This allows us to handle larger than legal types. if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors()) return SDValue(N, 0); // fold (sdiv X, pow2) if ((VT != MVT::i32 && VT != MVT::i64) || !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2())) return SDValue(); SDLoc DL(N); SDValue N0 = N->getOperand(0); unsigned Lg2 = Divisor.countr_zero(); SDValue Zero = DAG.getConstant(0, DL, VT); SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); // Add (N0 < 0) ? Pow2 - 1 : 0; SDValue CCVal; SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); Created.push_back(Cmp.getNode()); Created.push_back(Add.getNode()); Created.push_back(CSel.getNode()); // Divide by pow2. SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64)); // If we're dividing by a positive value, we're done. Otherwise, we must // negate the result. if (Divisor.isNonNegative()) return SRA; Created.push_back(SRA.getNode()); return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); } SDValue AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const { AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N, 0); // Lower SREM as SREM EVT VT = N->getValueType(0); // For scalable and fixed types, mark them as cheap so we can handle it much // later. This allows us to handle larger than legal types. 
if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors()) return SDValue(N, 0); // fold (srem X, pow2) if ((VT != MVT::i32 && VT != MVT::i64) || !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2())) return SDValue(); unsigned Lg2 = Divisor.countr_zero(); if (Lg2 == 0) return SDValue(); SDLoc DL(N); SDValue N0 = N->getOperand(0); SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); SDValue Zero = DAG.getConstant(0, DL, VT); SDValue CCVal, CSNeg; if (Lg2 == 1) { SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL); SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne); CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp); Created.push_back(Cmp.getNode()); Created.push_back(And.getNode()); } else { SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC); SDVTList VTs = DAG.getVTList(VT, MVT::i32); SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0); SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne); SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne); CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal, Negs.getValue(1)); Created.push_back(Negs.getNode()); Created.push_back(AndPos.getNode()); Created.push_back(AndNeg.getNode()); } return CSNeg; } static std::optional IsSVECntIntrinsic(SDValue S) { switch(getIntrinsicID(S.getNode())) { default: break; case Intrinsic::aarch64_sve_cntb: return 8; case Intrinsic::aarch64_sve_cnth: return 16; case Intrinsic::aarch64_sve_cntw: return 32; case Intrinsic::aarch64_sve_cntd: return 64; } return {}; } /// Calculates what the pre-extend type is, based on the extension /// operation node provided by \p Extend. /// /// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the /// pre-extend type is pulled directly from the operand, while other extend /// operations need a bit more inspection to get this information. /// /// \param Extend The SDNode from the DAG that represents the extend operation /// /// \returns The type representing the \p Extend source type, or \p MVT::Other /// if no valid type can be determined static EVT calculatePreExtendType(SDValue Extend) { switch (Extend.getOpcode()) { case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: return Extend.getOperand(0).getValueType(); case ISD::AssertSext: case ISD::AssertZext: case ISD::SIGN_EXTEND_INREG: { VTSDNode *TypeNode = dyn_cast(Extend.getOperand(1)); if (!TypeNode) return MVT::Other; return TypeNode->getVT(); } case ISD::AND: { ConstantSDNode *Constant = dyn_cast(Extend.getOperand(1).getNode()); if (!Constant) return MVT::Other; uint32_t Mask = Constant->getZExtValue(); if (Mask == UCHAR_MAX) return MVT::i8; else if (Mask == USHRT_MAX) return MVT::i16; else if (Mask == UINT_MAX) return MVT::i32; return MVT::Other; } default: return MVT::Other; } } /// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern /// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector /// SExt/ZExt rather than the scalar SExt/ZExt static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) { EVT VT = BV.getValueType(); if (BV.getOpcode() != ISD::BUILD_VECTOR && BV.getOpcode() != ISD::VECTOR_SHUFFLE) return SDValue(); // Use the first item in the buildvector/shuffle to get the size of the // extend, and make sure it looks valid. 
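  // e.g. in (v8i16 build_vector (sext i8 a), (sext i8 b), ...) the first
  // operand shows the pre-extend type is i8, so the whole node can become
  // (v8i16 sext (v8i8 build_vector a, b, ...)).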
SDValue Extend = BV->getOperand(0); unsigned ExtendOpcode = Extend.getOpcode(); bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND || ExtendOpcode == ISD::SIGN_EXTEND_INREG || ExtendOpcode == ISD::AssertSext; if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND && ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND) return SDValue(); // Shuffle inputs are vector, limit to SIGN_EXTEND and ZERO_EXTEND to ensure // calculatePreExtendType will work without issue. if (BV.getOpcode() == ISD::VECTOR_SHUFFLE && ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND) return SDValue(); // Restrict valid pre-extend data type EVT PreExtendType = calculatePreExtendType(Extend); if (PreExtendType == MVT::Other || PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2) return SDValue(); // Make sure all other operands are equally extended for (SDValue Op : drop_begin(BV->ops())) { if (Op.isUndef()) continue; unsigned Opc = Op.getOpcode(); bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG || Opc == ISD::AssertSext; if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType) return SDValue(); } SDValue NBV; SDLoc DL(BV); if (BV.getOpcode() == ISD::BUILD_VECTOR) { EVT PreExtendVT = VT.changeVectorElementType(PreExtendType); EVT PreExtendLegalType = PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType; SmallVector NewOps; for (SDValue Op : BV->ops()) NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType) : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, PreExtendLegalType)); NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps); } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType()); NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0), BV.getOperand(1).isUndef() ? DAG.getUNDEF(PreExtendVT) : BV.getOperand(1).getOperand(0), cast(BV)->getMask()); } return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV); } /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { // If the value type isn't a vector, none of the operands are going to be dups EVT VT = Mul->getValueType(0); if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64) return SDValue(); SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG); SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG); // Neither operands have been changed, don't make any further changes if (!Op0 && !Op1) return SDValue(); SDLoc DL(Mul); return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0), Op1 ? Op1 : Mul->getOperand(1)); } // Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz // Same for other types with equivalent constants. 
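// For v4i32: Srl(X, 15) & 0x10001 keeps bit 15 of each i16 half (the half's
// sign bit) in bits 0 and 16, and multiplying by 0xffff spreads each of those
// bits across its own half, i.e. 0xffff where the i16 half is negative and 0
// otherwise, which is exactly a v8i16 CMLT #0.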
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 && VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16) return SDValue(); if (N->getOperand(0).getOpcode() != ISD::AND || N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL) return SDValue(); SDValue And = N->getOperand(0); SDValue Srl = And.getOperand(0); APInt V1, V2, V3; if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) || !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) || !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3)) return SDValue(); unsigned HalfSize = VT.getScalarSizeInBits() / 2; if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) || V3 != (HalfSize - 1)) return SDValue(); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), EVT::getIntegerVT(*DAG.getContext(), HalfSize), VT.getVectorElementCount() * 2); SDLoc DL(N); SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0)); SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In); return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM); } static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (SDValue Ext = performMulVectorExtendCombine(N, DAG)) return Ext; if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG)) return Ext; if (DCI.isBeforeLegalizeOps()) return SDValue(); // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y, // and in MachineCombiner pass, add+mul will be combined into madd. // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X. SDLoc DL(N); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue MulOper; unsigned AddSubOpc; auto IsAddSubWith1 = [&](SDValue V) -> bool { AddSubOpc = V->getOpcode(); if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) { SDValue Opnd = V->getOperand(1); MulOper = V->getOperand(0); if (AddSubOpc == ISD::SUB) std::swap(Opnd, MulOper); if (auto C = dyn_cast(Opnd)) return C->isOne(); } return false; }; if (IsAddSubWith1(N0)) { SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper); return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal); } if (IsAddSubWith1(N1)) { SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper); return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal); } // The below optimizations require a constant RHS. if (!isa(N1)) return SDValue(); ConstantSDNode *C = cast(N1); const APInt &ConstValue = C->getAPIntValue(); // Allow the scaling to be folded into the `cnt` instruction by preventing // the scaling to be obscured here. This makes it easier to pattern match. if (IsSVECntIntrinsic(N0) || (N0->getOpcode() == ISD::TRUNCATE && (IsSVECntIntrinsic(N0->getOperand(0))))) if (ConstValue.sge(1) && ConstValue.sle(16)) return SDValue(); // Multiplication of a power of two plus/minus one can be done more // cheaply as as shift+add/sub. For now, this is true unilaterally. If // future CPUs have a cheaper MADD instruction, this may need to be // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and // 64-bit is 5 cycles, so this is always a win. // More aggressively, some multiplications N0 * C can be lowered to // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M, // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8) // TODO: lower more cases. // TrailingZeroes is used to test if the mul can be lowered to // shift+add+shift. 
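  // e.g. 40 = (4 + 1) << 3, so (mul x, 40) becomes ((x << 2) + x) << 3,
  // avoiding a MOV + MUL.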
unsigned TrailingZeroes = ConstValue.countr_zero(); if (TrailingZeroes) { // Conservatively do not lower to shift+add+shift if the mul might be // folded into smul or umul. if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) || isZeroExtended(N0.getNode(), DAG))) return SDValue(); // Conservatively do not lower to shift+add+shift if the mul might be // folded into madd or msub. if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD || N->use_begin()->getOpcode() == ISD::SUB)) return SDValue(); } // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub // and shift+add+shift. APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes); unsigned ShiftAmt; auto Shl = [&](SDValue N0, unsigned N1) { SDValue RHS = DAG.getConstant(N1, DL, MVT::i64); return DAG.getNode(ISD::SHL, DL, VT, N0, RHS); }; auto Add = [&](SDValue N0, SDValue N1) { return DAG.getNode(ISD::ADD, DL, VT, N0, N1); }; auto Sub = [&](SDValue N0, SDValue N1) { return DAG.getNode(ISD::SUB, DL, VT, N0, N1); }; auto Negate = [&](SDValue N) { SDValue Zero = DAG.getConstant(0, DL, VT); return DAG.getNode(ISD::SUB, DL, VT, Zero, N); }; // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg: // C = 45 is equal to (1+4)*(1+8), we don't decompose it into (1+2)*(16-1) as // the (2^N - 1) can't be execused via a single instruction. auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) { unsigned BitWidth = C.getBitWidth(); for (unsigned i = 1; i < BitWidth / 2; i++) { APInt Rem; APInt X(BitWidth, (1 << i) + 1); APInt::sdivrem(C, X, N, Rem); APInt NVMinus1 = N - 1; if (Rem == 0 && NVMinus1.isPowerOf2()) { M = X; return true; } } return false; }; if (ConstValue.isNonNegative()) { // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M) // (mul x, 2^N - 1) => (sub (shl x, N), x) // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M)) // (mul x, (2^M + 1) * (2^N + 1)) // => MV = (add (shl x, M), x); (add (shl MV, N), MV) APInt SCVMinus1 = ShiftedConstValue - 1; APInt SCVPlus1 = ShiftedConstValue + 1; APInt CVPlus1 = ConstValue + 1; APInt CVM, CVN; if (SCVMinus1.isPowerOf2()) { ShiftAmt = SCVMinus1.logBase2(); return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes); } else if (CVPlus1.isPowerOf2()) { ShiftAmt = CVPlus1.logBase2(); return Sub(Shl(N0, ShiftAmt), N0); } else if (SCVPlus1.isPowerOf2()) { ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes; return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes)); } else if (Subtarget->hasLSLFast() && isPowPlusPlusConst(ConstValue, CVM, CVN)) { APInt CVMMinus1 = CVM - 1; APInt CVNMinus1 = CVN - 1; unsigned ShiftM1 = CVMMinus1.logBase2(); unsigned ShiftN1 = CVNMinus1.logBase2(); // LSLFast implicate that Shifts <= 3 places are fast if (ShiftM1 <= 3 && ShiftN1 <= 3) { SDValue MVal = Add(Shl(N0, ShiftM1), N0); return Add(Shl(MVal, ShiftN1), MVal); } } } else { // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) // (mul x, -(2^N + 1)) => - (add (shl x, N), x) // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N)) APInt SCVPlus1 = -ShiftedConstValue + 1; APInt CVNegPlus1 = -ConstValue + 1; APInt CVNegMinus1 = -ConstValue - 1; if (CVNegPlus1.isPowerOf2()) { ShiftAmt = CVNegPlus1.logBase2(); return Sub(N0, Shl(N0, ShiftAmt)); } else if (CVNegMinus1.isPowerOf2()) { ShiftAmt = CVNegMinus1.logBase2(); return Negate(Add(Shl(N0, ShiftAmt), N0)); } else if (SCVPlus1.isPowerOf2()) { ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes; return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt)); } } return SDValue(); } static SDValue 
performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG) { // Take advantage of vector comparisons producing 0 or -1 in each lane to // optimize away operation when it's from a constant. // // The general transformation is: // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> // AND(VECTOR_CMP(x,y), constant2) // constant2 = UNARYOP(constant) // Early exit if this isn't a vector operation, the operand of the // unary operation isn't a bitwise AND, or if the sizes of the operations // aren't the same. EVT VT = N->getValueType(0); if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) return SDValue(); // Now check that the other operand of the AND is a constant. We could // make the transformation for non-constant splats as well, but it's unclear // that would be a benefit as it would not eliminate any operations, just // perform one more step in scalar code before moving to the vector unit. if (BuildVectorSDNode *BV = dyn_cast(N->getOperand(0)->getOperand(1))) { // Bail out if the vector isn't a constant. if (!BV->isConstant()) return SDValue(); // Everything checks out. Build up the new and improved node. SDLoc DL(N); EVT IntVT = BV->getValueType(0); // Create a new constant of the appropriate type for the transformed // DAG. SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); // The AND node needs bitcasts to/from an integer vector type around it. SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, N->getOperand(0)->getOperand(0), MaskConst); SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); return Res; } return SDValue(); } static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { // First try to optimize away the conversion when it's conditionally from // a constant. Vectors only. if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) return Res; EVT VT = N->getValueType(0); if (VT != MVT::f32 && VT != MVT::f64) return SDValue(); // Only optimize when the source and destination types have the same width. if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits()) return SDValue(); // If the result of an integer load is only used by an integer-to-float // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. // This eliminates an "integer-to-vector-move" UOP and improves throughput. SDValue N0 = N->getOperand(0); if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && // Do not change the width of a volatile load. !cast(N0)->isVolatile()) { LoadSDNode *LN0 = cast(N0); SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), LN0->getPointerInfo(), LN0->getAlign(), LN0->getMemOperand()->getFlags()); // Make sure successors of the original load stay after it by updating them // to use the new Chain. DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); unsigned Opcode = (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF; return DAG.getNode(Opcode, SDLoc(N), VT, Load); } return SDValue(); } /// Fold a floating-point multiply by power of two into floating-point to /// fixed-point conversion. 
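/// e.g. (v4i32 fp_to_sint (fmul x, splat(8.0))) can be lowered to a
/// fixed-point FCVTZS with 3 fractional bits, since multiplying by 2^3 before
/// truncating is what a convert with 3 fractional bits does.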
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (!Subtarget->isNeonAvailable()) return SDValue(); if (!N->getValueType(0).isSimple()) return SDValue(); SDValue Op = N->getOperand(0); if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL) return SDValue(); if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector()) return SDValue(); SDValue ConstVec = Op->getOperand(1); if (!isa(ConstVec)) return SDValue(); MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); uint32_t FloatBits = FloatTy.getSizeInBits(); if (FloatBits != 32 && FloatBits != 64 && (FloatBits != 16 || !Subtarget->hasFullFP16())) return SDValue(); MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); uint32_t IntBits = IntTy.getSizeInBits(); if (IntBits != 16 && IntBits != 32 && IntBits != 64) return SDValue(); // Avoid conversions where iN is larger than the float (e.g., float -> i64). if (IntBits > FloatBits) return SDValue(); BitVector UndefElements; BuildVectorSDNode *BV = cast(ConstVec); int32_t Bits = IntBits == 64 ? 64 : 32; int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); if (C == -1 || C == 0 || C > Bits) return SDValue(); EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger(); if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy)) return SDValue(); if (N->getOpcode() == ISD::FP_TO_SINT_SAT || N->getOpcode() == ISD::FP_TO_UINT_SAT) { EVT SatVT = cast(N->getOperand(1))->getVT(); if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits) return SDValue(); } SDLoc DL(N); bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT || N->getOpcode() == ISD::FP_TO_SINT_SAT); unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs : Intrinsic::aarch64_neon_vcvtfp2fxu; SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); // We can handle smaller integers by generating an extra trunc. if (IntBits < FloatBits) FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); return FixConv; } /// Fold a floating-point divide by power of two into fixed-point to /// floating-point conversion. static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); SDValue Op = N->getOperand(0); unsigned Opc = Op->getOpcode(); if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || !Op.getOperand(0).getValueType().isSimple() || (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) return SDValue(); SDValue ConstVec = N->getOperand(1); if (!isa(ConstVec)) return SDValue(); MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); int32_t IntBits = IntTy.getSizeInBits(); if (IntBits != 16 && IntBits != 32 && IntBits != 64) return SDValue(); MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); int32_t FloatBits = FloatTy.getSizeInBits(); if (FloatBits != 32 && FloatBits != 64) return SDValue(); // Avoid conversions where iN is larger than the float (e.g., i64 -> float). 
if (IntBits > FloatBits) return SDValue(); BitVector UndefElements; BuildVectorSDNode *BV = cast(ConstVec); int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1); if (C == -1 || C == 0 || C > FloatBits) return SDValue(); MVT ResTy; unsigned NumLanes = Op.getValueType().getVectorNumElements(); switch (NumLanes) { default: return SDValue(); case 2: ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; break; case 4: ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; break; } if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) return SDValue(); SDLoc DL(N); SDValue ConvInput = Op.getOperand(0); bool IsSigned = Opc == ISD::SINT_TO_FP; if (IntBits < FloatBits) ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, ResTy, ConvInput); unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp : Intrinsic::aarch64_neon_vcvtfxu2fp; return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput, DAG.getConstant(C, DL, MVT::i32)); } /// An EXTR instruction is made up of two shifts, ORed together. This helper /// searches for and classifies those shifts. static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, bool &FromHi) { if (N.getOpcode() == ISD::SHL) FromHi = false; else if (N.getOpcode() == ISD::SRL) FromHi = true; else return false; if (!isa(N.getOperand(1))) return false; ShiftAmount = N->getConstantOperandVal(1); Src = N->getOperand(0); return true; } /// EXTR instruction extracts a contiguous chunk of bits from two existing /// registers viewed as a high/low pair. This function looks for the pattern: /// (or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N)) and replaces it /// with an EXTR. Can't quite be done in TableGen because the two immediates /// aren't independent. static SDValue tryCombineToEXTR(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); EVT VT = N->getValueType(0); assert(N->getOpcode() == ISD::OR && "Unexpected root"); if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); SDValue LHS; uint32_t ShiftLHS = 0; bool LHSFromHi = false; if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) return SDValue(); SDValue RHS; uint32_t ShiftRHS = 0; bool RHSFromHi = false; if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) return SDValue(); // If they're both trying to come from the high part of the register, they're // not really an EXTR. if (LHSFromHi == RHSFromHi) return SDValue(); if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) return SDValue(); if (LHSFromHi) { std::swap(LHS, RHS); std::swap(ShiftLHS, ShiftRHS); } return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, DAG.getConstant(ShiftRHS, DL, MVT::i64)); } static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64TargetLowering &TLI) { EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); if (!VT.isVector()) return SDValue(); // The combining code currently only works for NEON vectors. In particular, // it does not work for SVE when dealing with vectors wider than 128 bits. // It also doesn't work for streaming mode because it causes generating // bsl instructions that are invalid in streaming mode. 
if (TLI.useSVEForFixedLengthVectorVT( VT, !DAG.getSubtarget().isNeonAvailable())) return SDValue(); SDValue N0 = N->getOperand(0); if (N0.getOpcode() != ISD::AND) return SDValue(); SDValue N1 = N->getOperand(1); if (N1.getOpcode() != ISD::AND) return SDValue(); // InstCombine does (not (neg a)) => (add a -1). // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c) // Loop over all combinations of AND operands. for (int i = 1; i >= 0; --i) { for (int j = 1; j >= 0; --j) { SDValue O0 = N0->getOperand(i); SDValue O1 = N1->getOperand(j); SDValue Sub, Add, SubSibling, AddSibling; // Find a SUB and an ADD operand, one from each AND. if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) { Sub = O0; Add = O1; SubSibling = N0->getOperand(1 - i); AddSibling = N1->getOperand(1 - j); } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) { Add = O0; Sub = O1; AddSibling = N0->getOperand(1 - i); SubSibling = N1->getOperand(1 - j); } else continue; if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode())) continue; // Constant ones is always righthand operand of the Add. if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode())) continue; if (Sub.getOperand(1) != Add.getOperand(0)) continue; return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling); } } // (or (and a b) (and (not a) c)) => (bsl a b c) // We only have to look for constant vectors here since the general, variable // case can be handled in TableGen. unsigned Bits = VT.getScalarSizeInBits(); uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); for (int i = 1; i >= 0; --i) for (int j = 1; j >= 0; --j) { BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(i)); BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(j)); if (!BVN0 || !BVN1) continue; bool FoundMatch = true; for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { ConstantSDNode *CN0 = dyn_cast(BVN0->getOperand(k)); ConstantSDNode *CN1 = dyn_cast(BVN1->getOperand(k)); if (!CN0 || !CN1 || CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { FoundMatch = false; break; } } if (FoundMatch) return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0), N0->getOperand(1 - i), N1->getOperand(1 - j)); } return SDValue(); } // Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to // convert to csel(ccmp(.., cc0)), depending on cc1: // (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1))) // => // (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0)) // // (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1))) // => // (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0)) static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDValue CSel0 = N->getOperand(0); SDValue CSel1 = N->getOperand(1); if (CSel0.getOpcode() != AArch64ISD::CSEL || CSel1.getOpcode() != AArch64ISD::CSEL) return SDValue(); if (!CSel0->hasOneUse() || !CSel1->hasOneUse()) return SDValue(); if (!isNullConstant(CSel0.getOperand(0)) || !isOneConstant(CSel0.getOperand(1)) || !isNullConstant(CSel1.getOperand(0)) || !isOneConstant(CSel1.getOperand(1))) return SDValue(); SDValue Cmp0 = CSel0.getOperand(3); SDValue Cmp1 = CSel1.getOperand(3); AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2); AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2); if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse()) return SDValue(); if (Cmp1.getOpcode() != AArch64ISD::SUBS && Cmp0.getOpcode() == AArch64ISD::SUBS) { std::swap(Cmp0, Cmp1); std::swap(CC0, CC1); } if (Cmp1.getOpcode() != AArch64ISD::SUBS) return SDValue(); 
SDLoc DL(N); SDValue CCmp, Condition; unsigned NZCV; if (N->getOpcode() == ISD::AND) { AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0); Condition = DAG.getConstant(InvCC0, DL, MVT_CC); NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1); } else { AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1); Condition = DAG.getConstant(CC0, DL, MVT_CC); NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1); } SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); auto *Op1 = dyn_cast(Cmp1.getOperand(1)); if (Op1 && Op1->getAPIntValue().isNegative() && Op1->getAPIntValue().sgt(-32)) { // CCMP accept the constant int the range [0, 31] // if the Op1 is a constant in the range [-31, -1], we // can select to CCMN to avoid the extra mov SDValue AbsOp1 = DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0)); CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1, NZCVOp, Condition, Cmp0); } else { CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0), Cmp1.getOperand(1), NZCVOp, Condition, Cmp0); } return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0), CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32), CCmp); } static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget, const AArch64TargetLowering &TLI) { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); if (SDValue R = performANDORCSELCombine(N, DAG)) return R; if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) if (SDValue Res = tryCombineToEXTR(N, DCI)) return Res; if (SDValue Res = tryCombineToBSL(N, DCI, TLI)) return Res; return SDValue(); } static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) { if (!MemVT.getVectorElementType().isSimple()) return false; uint64_t MaskForTy = 0ull; switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) { case MVT::i8: MaskForTy = 0xffull; break; case MVT::i16: MaskForTy = 0xffffull; break; case MVT::i32: MaskForTy = 0xffffffffull; break; default: return false; break; } if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR) if (auto *Op0 = dyn_cast(N->getOperand(0))) return Op0->getAPIntValue().getLimitedValue() == MaskForTy; return false; } static bool isAllInactivePredicate(SDValue N) { // Look through cast. while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) N = N.getOperand(0); return ISD::isConstantSplatVectorAllZeros(N.getNode()); } static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) { unsigned NumElts = N.getValueType().getVectorMinNumElements(); // Look through cast. while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) { N = N.getOperand(0); // When reinterpreting from a type with fewer elements the "new" elements // are not active, so bail if they're likely to be used. if (N.getValueType().getVectorMinNumElements() < NumElts) return false; } if (ISD::isConstantSplatVectorAllOnes(N.getNode())) return true; // "ptrue p., all" can be considered all active when is the same size // or smaller than the implicit element type represented by N. // NOTE: A larger element count implies a smaller element type. if (N.getOpcode() == AArch64ISD::PTRUE && N.getConstantOperandVal(0) == AArch64SVEPredPattern::all) return N.getValueType().getVectorMinNumElements() >= NumElts; // If we're compiling for a specific vector-length, we can check if the // pattern's VL equals that of the scalable vector at runtime. 
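  // e.g. when the SVE register size is known to be exactly 256 bits
  // (vscale == 2), a PTRUE with pattern VL16 covers all lanes of an nxv8i1
  // predicate, since 8 minimum elements * 2 == 16.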
if (N.getOpcode() == AArch64ISD::PTRUE) { const auto &Subtarget = DAG.getSubtarget(); unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits(); unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits(); if (MaxSVESize && MinSVESize == MaxSVESize) { unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock; unsigned PatNumElts = getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0)); return PatNumElts == (NumElts * VScale); } } return false; } static SDValue performReinterpretCastCombine(SDNode *N) { SDValue LeafOp = SDValue(N, 0); SDValue Op = N->getOperand(0); while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST && LeafOp.getValueType() != Op.getValueType()) Op = Op->getOperand(0); if (LeafOp.getValueType() == Op.getValueType()) return Op; return SDValue(); } static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SelectionDAG &DAG = DCI.DAG; SDValue Src = N->getOperand(0); unsigned Opc = Src->getOpcode(); // Zero/any extend of an unsigned unpack if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) { SDValue UnpkOp = Src->getOperand(0); SDValue Dup = N->getOperand(1); if (Dup.getOpcode() != ISD::SPLAT_VECTOR) return SDValue(); SDLoc DL(N); ConstantSDNode *C = dyn_cast(Dup->getOperand(0)); if (!C) return SDValue(); uint64_t ExtVal = C->getZExtValue(); auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool { return ((ExtVal == 0xFF && VT == MVT::i8) || (ExtVal == 0xFFFF && VT == MVT::i16) || (ExtVal == 0xFFFFFFFF && VT == MVT::i32)); }; // If the mask is fully covered by the unpack, we don't need to push // a new AND onto the operand EVT EltTy = UnpkOp->getValueType(0).getVectorElementType(); if (MaskAndTypeMatch(EltTy)) return Src; // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check // to see if the mask is all-ones of size MemTy. auto MaskedLoadOp = dyn_cast(UnpkOp); if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD || MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) { EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType(); if (MaskAndTypeMatch(EltTy)) return Src; } // Truncate to prevent a DUP with an over wide constant APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits()); // Otherwise, make sure we propagate the AND to the operand // of the unpack Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0), DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32)); SDValue And = DAG.getNode(ISD::AND, DL, UnpkOp->getValueType(0), UnpkOp, Dup); return DAG.getNode(Opc, DL, N->getValueType(0), And); } // If both sides of AND operations are i1 splat_vectors then // we can produce just i1 splat_vector as the result. if (isAllActivePredicate(DAG, N->getOperand(0))) return N->getOperand(1); if (isAllActivePredicate(DAG, N->getOperand(1))) return N->getOperand(0); if (!EnableCombineMGatherIntrinsics) return SDValue(); SDValue Mask = N->getOperand(1); if (!Src.hasOneUse()) return SDValue(); EVT MemVT; // SVE load instructions perform an implicit zero-extend, which makes them // perfect candidates for combining. 
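  // e.g. an LD1B zero-extended to nxv4i32 already has bits 8-31 of every lane
  // cleared, so an AND with a splat of 0xff is redundant and the load itself
  // can be returned.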
switch (Opc) { case AArch64ISD::LD1_MERGE_ZERO: case AArch64ISD::LDNF1_MERGE_ZERO: case AArch64ISD::LDFF1_MERGE_ZERO: MemVT = cast(Src->getOperand(3))->getVT(); break; case AArch64ISD::GLD1_MERGE_ZERO: case AArch64ISD::GLD1_SCALED_MERGE_ZERO: case AArch64ISD::GLD1_SXTW_MERGE_ZERO: case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: case AArch64ISD::GLD1_UXTW_MERGE_ZERO: case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: case AArch64ISD::GLD1_IMM_MERGE_ZERO: case AArch64ISD::GLDFF1_MERGE_ZERO: case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: case AArch64ISD::GLDNT1_MERGE_ZERO: MemVT = cast(Src->getOperand(4))->getVT(); break; default: return SDValue(); } if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT)) return Src; return SDValue(); } static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); EVT VT = N->getValueType(0); if (SDValue R = performANDORCSELCombine(N, DAG)) return R; if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); if (VT.isScalableVector()) return performSVEAndCombine(N, DCI); // The combining code below works only for NEON vectors. In particular, it // does not work for SVE when dealing with vectors wider than 128 bits. if (!VT.is64BitVector() && !VT.is128BitVector()) return SDValue(); BuildVectorSDNode *BVN = dyn_cast(RHS.getNode()); if (!BVN) return SDValue(); // AND does not accept an immediate, so check if we can use a BIC immediate // instruction instead. We do this here instead of using a (and x, (mvni imm)) // pattern in isel, because some immediates may be lowered to the preferred // (and x, (movi imm)) form, even though an mvni representation also exists. APInt DefBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); if (resolveBuildVector(BVN, DefBits, UndefBits)) { SDValue NewOp; // Any bits known to already be 0 need not be cleared again, which can help // reduce the size of the immediate to one supported by the instruction. 
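    // e.g. an AND mask of 0x0000ff00 needs both the top half and the low byte
    // cleared, which a single BIC immediate cannot do, but if the top half is
    // already known zero it reduces to a BIC of #0xff.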
KnownBits Known = DAG.computeKnownBits(LHS); APInt ZeroSplat(VT.getSizeInBits(), 0); for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++) ZeroSplat |= Known.Zero.zext(VT.getSizeInBits()) << (Known.Zero.getBitWidth() * I); DefBits = ~(DefBits | ZeroSplat); if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG, DefBits, &LHS)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG, DefBits, &LHS))) return NewOp; UndefBits = ~(UndefBits | ZeroSplat); if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG, UndefBits, &LHS)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG, UndefBits, &LHS))) return NewOp; } return SDValue(); } static SDValue performFADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); if (!N->getFlags().hasAllowReassociation()) return SDValue(); // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), b, c) auto ReassocComplex = [&](SDValue A, SDValue B) { if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN) return SDValue(); unsigned Opc = A.getConstantOperandVal(0); if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 && Opc != Intrinsic::aarch64_neon_vcmla_rot90 && Opc != Intrinsic::aarch64_neon_vcmla_rot180 && Opc != Intrinsic::aarch64_neon_vcmla_rot270) return SDValue(); SDValue VCMLA = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()), A.getOperand(2), A.getOperand(3)); VCMLA->setFlags(A->getFlags()); return VCMLA; }; if (SDValue R = ReassocComplex(LHS, RHS)) return R; if (SDValue R = ReassocComplex(RHS, LHS)) return R; return SDValue(); } static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) { switch (Opcode) { case ISD::STRICT_FADD: case ISD::FADD: return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64; case ISD::ADD: return VT == MVT::i64; default: return false; } } static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond); static bool isPredicateCCSettingOp(SDValue N) { if ((N.getOpcode() == ISD::SETCC) || (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN && (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege || N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt || N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi || N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs || N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele || N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo || N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels || N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt || // get_active_lane_mask is lowered to a whilelo instruction. N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask))) return true; return false; } // Materialize : i1 = extract_vector_elt t37, Constant:i64<0> // ... into: "ptrue p, all" + PTEST static SDValue performFirstTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT); // Make sure PTEST can be legalised with illegal types. 
if (!Subtarget->hasSVE() || DCI.isBeforeLegalize()) return SDValue(); SDValue N0 = N->getOperand(0); EVT VT = N0.getValueType(); if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 || !isNullConstant(N->getOperand(1))) return SDValue(); // Restricted the DAG combine to only cases where we're extracting from a // flag-setting operation. if (!isPredicateCCSettingOp(N0)) return SDValue(); // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0 SelectionDAG &DAG = DCI.DAG; SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all); return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE); } // Materialize : Idx = (add (mul vscale, NumEls), -1) // i1 = extract_vector_elt t37, Constant:i64 // ... into: "ptrue p, all" + PTEST static SDValue performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT); // Make sure PTEST is legal types. if (!Subtarget->hasSVE() || DCI.isBeforeLegalize()) return SDValue(); SDValue N0 = N->getOperand(0); EVT OpVT = N0.getValueType(); if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1) return SDValue(); // Idx == (add (mul vscale, NumEls), -1) SDValue Idx = N->getOperand(1); if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1))) return SDValue(); SDValue VS = Idx.getOperand(0); if (VS.getOpcode() != ISD::VSCALE) return SDValue(); unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue(); if (VS.getConstantOperandVal(0) != NumEls) return SDValue(); // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0 SelectionDAG &DAG = DCI.DAG; SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all); return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE); } static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT); if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget)) return Res; if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget)) return Res; SelectionDAG &DAG = DCI.DAG; SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); EVT VT = N->getValueType(0); const bool FullFP16 = DAG.getSubtarget().hasFullFP16(); bool IsStrict = N0->isStrictFPOpcode(); // extract(dup x) -> x if (N0.getOpcode() == AArch64ISD::DUP) return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT) : N0.getOperand(0); // Rewrite for pairwise fadd pattern // (f32 (extract_vector_elt // (fadd (vXf32 Other) // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0)) // -> // (f32 (fadd (extract_vector_elt (vXf32 Other) 0) // (extract_vector_elt (vXf32 Other) 1)) // For strict_fadd we need to make sure the old strict_fadd can be deleted, so // we can only do this when it's used only by the extract_vector_elt. if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) && (!IsStrict || N0.hasOneUse())) { SDLoc DL(N0); SDValue N00 = N0->getOperand(IsStrict ? 1 : 0); SDValue N01 = N0->getOperand(IsStrict ? 2 : 1); ShuffleVectorSDNode *Shuffle = dyn_cast(N01); SDValue Other = N00; // And handle the commutative case. 
if (!Shuffle) { Shuffle = dyn_cast(N00); Other = N01; } if (Shuffle && Shuffle->getMaskElt(0) == 1 && Other == Shuffle->getOperand(0)) { SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, DAG.getConstant(0, DL, MVT::i64)); SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, DAG.getConstant(1, DL, MVT::i64)); if (!IsStrict) return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2); // For strict_fadd we need uses of the final extract_vector to be replaced // with the strict_fadd, but we also need uses of the chain output of the // original strict_fadd to use the chain output of the new strict_fadd as // otherwise it may not be deleted. SDValue Ret = DAG.getNode(N0->getOpcode(), DL, {VT, MVT::Other}, {N0->getOperand(0), Extract1, Extract2}); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1)); return SDValue(N, 0); } } return SDValue(); } static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode(); if (VT.isScalableVector()) return SDValue(); // Optimize concat_vectors of truncated vectors, where the intermediate // type is illegal, to avoid said illegality, e.g., // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))), // (v2i16 (truncate (v2i64))))) // -> // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))), // (v4i32 (bitcast (v2i64))), // <0, 2, 4, 6>))) // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed // on both input and result type, so we might generate worse code. // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE && N1Opc == ISD::TRUNCATE) { SDValue N00 = N0->getOperand(0); SDValue N10 = N1->getOperand(0); EVT N00VT = N00.getValueType(); if (N00VT == N10.getValueType() && (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) && N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) { MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16); SmallVector Mask(MidVT.getVectorNumElements()); for (size_t i = 0; i < Mask.size(); ++i) Mask[i] = i * 2; return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getVectorShuffle( MidVT, dl, DAG.getNode(ISD::BITCAST, dl, MidVT, N00), DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask)); } } if (N->getOperand(0).getValueType() == MVT::v4i8) { // If we have a concat of v4i8 loads, convert them to a buildvector of f32 // loads to prevent having to go through the v4i8 load legalization that // needs to extend each element into a larger type. 
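// For example (sketch of the four-operand case):
//   (v16i8 (concat_vectors (v4i8 (load p0)), (v4i8 (load p1)),
//                          (v4i8 (load p2)), (v4i8 (load p3))))
//   -> (v16i8 (bitcast (v4f32 (build_vector (f32 (load p0)), (f32 (load p1)),
//                                           (f32 (load p2)), (f32 (load p3))))))
// Each f32 load reads the same four bytes as the v4i8 load it replaces.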
if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) { if (V.getValueType() != MVT::v4i8) return false; if (V.isUndef()) return true; LoadSDNode *LD = dyn_cast(V); return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() && LD->getExtensionType() == ISD::NON_EXTLOAD; })) { EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands()); SmallVector Ops; for (unsigned i = 0; i < N->getNumOperands(); i++) { SDValue V = N->getOperand(i); if (V.isUndef()) Ops.push_back(DAG.getUNDEF(MVT::f32)); else { LoadSDNode *LD = cast(V); SDValue NewLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(), LD->getMemOperand()); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1)); Ops.push_back(NewLoad); } } return DAG.getBitcast(N->getValueType(0), DAG.getBuildVector(NVT, dl, Ops)); } } // Canonicalise concat_vectors to replace concatenations of truncated nots // with nots of concatenated truncates. This in some cases allows for multiple // redundant negations to be eliminated. // (concat_vectors (v4i16 (truncate (not (v4i32)))), // (v4i16 (truncate (not (v4i32))))) // -> // (not (concat_vectors (v4i16 (truncate (v4i32))), // (v4i16 (truncate (v4i32))))) if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE && N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) && N->isOnlyUserOf(N1.getNode())) { auto isBitwiseVectorNegate = [](SDValue V) { return V->getOpcode() == ISD::XOR && ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode()); }; SDValue N00 = N0->getOperand(0); SDValue N10 = N1->getOperand(0); if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) && isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) { return DAG.getNOT( dl, DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(), N00->getOperand(0)), DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(), N10->getOperand(0))), VT); } } // Wait till after everything is legalized to try this. That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); // Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use // extracted subvectors from the same original vectors. Combine these into a // single avg that operates on the two original vectors. // avgceil is the target independant name for rhadd, avgfloor is a hadd. 
// Example: // (concat_vectors (v8i8 (avgceils (extract_subvector (v16i8 OpA, <0>), // extract_subvector (v16i8 OpB, <0>))), // (v8i8 (avgceils (extract_subvector (v16i8 OpA, <8>), // extract_subvector (v16i8 OpB, <8>))))) // -> // (v16i8(avgceils(v16i8 OpA, v16i8 OpB))) if (N->getNumOperands() == 2 && N0Opc == N1Opc && (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS || N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) { SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); SDValue N10 = N1->getOperand(0); SDValue N11 = N1->getOperand(1); EVT N00VT = N00.getValueType(); EVT N10VT = N10.getValueType(); if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR && N01->getOpcode() == ISD::EXTRACT_SUBVECTOR && N10->getOpcode() == ISD::EXTRACT_SUBVECTOR && N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) { SDValue N00Source = N00->getOperand(0); SDValue N01Source = N01->getOperand(0); SDValue N10Source = N10->getOperand(0); SDValue N11Source = N11->getOperand(0); if (N00Source == N10Source && N01Source == N11Source && N00Source.getValueType() == VT && N01Source.getValueType() == VT) { assert(N0.getValueType() == N1.getValueType()); uint64_t N00Index = N00.getConstantOperandVal(1); uint64_t N01Index = N01.getConstantOperandVal(1); uint64_t N10Index = N10.getConstantOperandVal(1); uint64_t N11Index = N11.getConstantOperandVal(1); if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 && N10Index == N00VT.getVectorNumElements()) return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source); } } } auto IsRSHRN = [](SDValue Shr) { if (Shr.getOpcode() != AArch64ISD::VLSHR) return false; SDValue Op = Shr.getOperand(0); EVT VT = Op.getValueType(); unsigned ShtAmt = Shr.getConstantOperandVal(1); if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD) return false; APInt Imm; if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift) Imm = APInt(VT.getScalarSizeInBits(), Op.getOperand(1).getConstantOperandVal(0) << Op.getOperand(1).getConstantOperandVal(1)); else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP && isa(Op.getOperand(1).getOperand(0))) Imm = APInt(VT.getScalarSizeInBits(), Op.getOperand(1).getConstantOperandVal(0)); else return false; if (Imm != 1ULL << (ShtAmt - 1)) return false; return true; }; // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y)) if (N->getNumOperands() == 2 && IsRSHRN(N0) && ((IsRSHRN(N1) && N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) || N1.isUndef())) { SDValue X = N0.getOperand(0).getOperand(0); SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType()) : N1.getOperand(0).getOperand(0); EVT BVT = X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext()); SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y); SDValue Add = DAG.getNode( ISD::ADD, dl, BVT, CC, DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT)); SDValue Shr = DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1)); return Shr; } // concat(zip1(a, b), zip2(a, b)) is zip1(a, b) if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 && N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) && N0.getOperand(1) == N1.getOperand(1)) { SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0), DAG.getUNDEF(N0.getValueType())); SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1), DAG.getUNDEF(N0.getValueType())); return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1); } // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector // splat. 
The indexed instructions are going to be expecting a DUPLANE64, so // canonicalise to that. if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) { assert(VT.getScalarSizeInBits() == 64); return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), DAG.getConstant(0, dl, MVT::i64)); } // Canonicalise concat_vectors so that the right-hand vector has as few // bit-casts as possible before its real operation. The primary matching // destination for these operations will be the narrowing "2" instructions, // which depend on the operation being performed on this right-hand vector. // For example, // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) // becomes // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST) return SDValue(); SDValue RHS = N1->getOperand(0); MVT RHSTy = RHS.getValueType().getSimpleVT(); // If the RHS is not a vector, this is not the pattern we're looking for. if (!RHSTy.isVector()) return SDValue(); LLVM_DEBUG( dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), RHSTy.getVectorNumElements() * 2); return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, DAG.getNode(ISD::BITCAST, dl, RHSTy, N0), RHS)); } static SDValue performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalizeOps()) return SDValue(); EVT VT = N->getValueType(0); if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1) return SDValue(); SDValue V = N->getOperand(0); // NOTE: This combine exists in DAGCombiner, but that version's legality check // blocks this combine because the non-const case requires custom lowering. // // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const) if (V.getOpcode() == ISD::SPLAT_VECTOR) if (isa(V.getOperand(0))) return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0)); return SDValue(); } static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { SDLoc DL(N); SDValue Vec = N->getOperand(0); SDValue SubVec = N->getOperand(1); uint64_t IdxVal = N->getConstantOperandVal(2); EVT VecVT = Vec.getValueType(); EVT SubVT = SubVec.getValueType(); // Only do this for legal fixed vector types. if (!VecVT.isFixedLengthVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) || !DAG.getTargetLoweringInfo().isTypeLegal(SubVT)) return SDValue(); // Ignore widening patterns. if (IdxVal == 0 && Vec.isUndef()) return SDValue(); // Subvector must be half the width and an "aligned" insertion. unsigned NumSubElts = SubVT.getVectorNumElements(); if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() || (IdxVal != 0 && IdxVal != NumSubElts)) return SDValue(); // Fold insert_subvector -> concat_vectors // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi)) // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub) SDValue Lo, Hi; if (IdxVal == 0) { Lo = SubVec; Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec, DAG.getVectorIdxConstant(NumSubElts, DL)); } else { Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec, DAG.getVectorIdxConstant(0, DL)); Hi = SubVec; } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi); } static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // Wait until after everything is legalized to try this. 
That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); // Transform a scalar conversion of a value from a lane extract into a // lane extract of a vector conversion. E.g., from foo1 to foo2: // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } // // The second form interacts better with instruction selection and the // register allocator to avoid cross-class register copies that aren't // coalescable due to a lane reference. // Check the operand and see if it originates from a lane extract. SDValue Op1 = N->getOperand(1); if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // Yep, no additional predication needed. Perform the transform. SDValue IID = N->getOperand(0); SDValue Shift = N->getOperand(2); SDValue Vec = Op1.getOperand(0); SDValue Lane = Op1.getOperand(1); EVT ResTy = N->getValueType(0); EVT VecResTy; SDLoc DL(N); // The vector width should be 128 bits by the time we get here, even // if it started as 64 bits (the extract_vector handling will have // done so). Bail if it is not. if (Vec.getValueSizeInBits() != 128) return SDValue(); if (Vec.getValueType() == MVT::v4i32) VecResTy = MVT::v4f32; else if (Vec.getValueType() == MVT::v2i64) VecResTy = MVT::v2f64; else return SDValue(); SDValue Convert = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); } // AArch64 high-vector "long" operations are formed by performing the non-high // version on an extract_subvector of each operand which gets the high half: // // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) // // However, there are cases which don't have an extract_high explicitly, but // have another operation that can be made compatible with one for free. For // example: // // (dupv64 scalar) --> (extract_high (dup128 scalar)) // // This routine does the actual conversion of such DUPs, once outer routines // have determined that everything else is in order. // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold // similarly here. static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { MVT VT = N.getSimpleValueType(); if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR && N.getConstantOperandVal(1) == 0) N = N.getOperand(0); switch (N.getOpcode()) { case AArch64ISD::DUP: case AArch64ISD::DUPLANE8: case AArch64ISD::DUPLANE16: case AArch64ISD::DUPLANE32: case AArch64ISD::DUPLANE64: case AArch64ISD::MOVI: case AArch64ISD::MOVIshift: case AArch64ISD::MOVIedit: case AArch64ISD::MOVImsl: case AArch64ISD::MVNIshift: case AArch64ISD::MVNImsl: break; default: // FMOV could be supported, but isn't very useful, as it would only occur // if you passed a bitcast' floating point immediate to an eligible long // integer op (addl, smull, ...). 
return SDValue(); } if (!VT.is64BitVector()) return SDValue(); SDLoc DL(N); unsigned NumElems = VT.getVectorNumElements(); if (N.getValueType().is64BitVector()) { MVT ElementTy = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2); N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops()); } return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N, DAG.getConstant(NumElems, DL, MVT::i64)); } static bool isEssentiallyExtractHighSubvector(SDValue N) { if (N.getOpcode() == ISD::BITCAST) N = N.getOperand(0); if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR) return false; if (N.getOperand(0).getValueType().isScalableVector()) return false; return cast(N.getOperand(1))->getAPIntValue() == N.getOperand(0).getValueType().getVectorNumElements() / 2; } /// Helper structure to keep track of ISD::SET_CC operands. struct GenericSetCCInfo { const SDValue *Opnd0; const SDValue *Opnd1; ISD::CondCode CC; }; /// Helper structure to keep track of a SET_CC lowered into AArch64 code. struct AArch64SetCCInfo { const SDValue *Cmp; AArch64CC::CondCode CC; }; /// Helper structure to keep track of SetCC information. union SetCCInfo { GenericSetCCInfo Generic; AArch64SetCCInfo AArch64; }; /// Helper structure to be able to read SetCC information. If set to /// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a /// GenericSetCCInfo. struct SetCCInfoAndKind { SetCCInfo Info; bool IsAArch64; }; /// Check whether or not \p Op is a SET_CC operation, either a generic or /// an /// AArch64 lowered one. /// \p SetCCInfo is filled accordingly. /// \post SetCCInfo is meanginfull only when this function returns true. /// \return True when Op is a kind of SET_CC operation. static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { // If this is a setcc, this is straight forward. if (Op.getOpcode() == ISD::SETCC) { SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0); SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1); SetCCInfo.Info.Generic.CC = cast(Op.getOperand(2))->get(); SetCCInfo.IsAArch64 = false; return true; } // Otherwise, check if this is a matching csel instruction. // In other words: // - csel 1, 0, cc // - csel 0, 1, !cc if (Op.getOpcode() != AArch64ISD::CSEL) return false; // Set the information about the operands. // TODO: we want the operands of the Cmp not the csel SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3); SetCCInfo.IsAArch64 = true; SetCCInfo.Info.AArch64.CC = static_cast( cast(Op.getOperand(2))->getZExtValue()); // Check that the operands matches the constraints: // (1) Both operands must be constants. // (2) One must be 1 and the other must be 0. ConstantSDNode *TValue = dyn_cast(Op.getOperand(0)); ConstantSDNode *FValue = dyn_cast(Op.getOperand(1)); // Check (1). if (!TValue || !FValue) return false; // Check (2). if (!TValue->isOne()) { // Update the comparison when we are interested in !cc. std::swap(TValue, FValue); SetCCInfo.Info.AArch64.CC = AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC); } return TValue->isOne() && FValue->isZero(); } // Returns true if Op is setcc or zext of setcc. static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { if (isSetCC(Op, Info)) return true; return ((Op.getOpcode() == ISD::ZERO_EXTEND) && isSetCC(Op->getOperand(0), Info)); } // The folding we want to perform is: // (add x, [zext] (setcc cc ...) ) // --> // (csel x, (add x, 1), !cc ...) // // The latter will get matched to a CSINC instruction. 
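// For example, "x + (a != b)" can end up as (illustrative assembly, not a
// literal isel output):
//   cmp  a, b
//   cinc x, x, ne        // cinc is an alias of csinc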
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!"); SDValue LHS = Op->getOperand(0); SDValue RHS = Op->getOperand(1); SetCCInfoAndKind InfoAndKind; // If both operands are a SET_CC, then we don't want to perform this // folding and create another csel as this results in more instructions // (and higher register usage). if (isSetCCOrZExtSetCC(LHS, InfoAndKind) && isSetCCOrZExtSetCC(RHS, InfoAndKind)) return SDValue(); // If neither operand is a SET_CC, give up. if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { std::swap(LHS, RHS); if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) return SDValue(); } // FIXME: This could be generatized to work for FP comparisons. EVT CmpVT = InfoAndKind.IsAArch64 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType() : InfoAndKind.Info.Generic.Opnd0->getValueType(); if (CmpVT != MVT::i32 && CmpVT != MVT::i64) return SDValue(); SDValue CCVal; SDValue Cmp; SDLoc dl(Op); if (InfoAndKind.IsAArch64) { CCVal = DAG.getConstant( AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl, MVT::i32); Cmp = *InfoAndKind.Info.AArch64.Cmp; } else Cmp = getAArch64Cmp( *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1, ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG, dl); EVT VT = Op->getValueType(0); LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT)); return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); } // ADD(UADDV a, UADDV b) --> UADDV(ADD a, b) static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); // Only scalar integer and vector types. if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger()) return SDValue(); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT) return SDValue(); auto *LHSN1 = dyn_cast(LHS->getOperand(1)); auto *RHSN1 = dyn_cast(RHS->getOperand(1)); if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero()) return SDValue(); SDValue Op1 = LHS->getOperand(0); SDValue Op2 = RHS->getOperand(0); EVT OpVT1 = Op1.getValueType(); EVT OpVT2 = Op2.getValueType(); if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 || Op2.getOpcode() != AArch64ISD::UADDV || OpVT1.getVectorElementType() != VT) return SDValue(); SDValue Val1 = Op1.getOperand(0); SDValue Val2 = Op2.getOperand(0); EVT ValVT = Val1->getValueType(0); SDLoc DL(N); SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal), DAG.getConstant(0, DL, MVT::i64)); } /// Perform the scalar expression combine in the form of: /// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc) /// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc) static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD) return SDValue(); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); // Handle commutivity. 
if (LHS.getOpcode() != AArch64ISD::CSEL && LHS.getOpcode() != AArch64ISD::CSNEG) { std::swap(LHS, RHS); if (LHS.getOpcode() != AArch64ISD::CSEL && LHS.getOpcode() != AArch64ISD::CSNEG) { return SDValue(); } } if (!LHS.hasOneUse()) return SDValue(); AArch64CC::CondCode AArch64CC = static_cast(LHS.getConstantOperandVal(2)); // The CSEL should include a const one operand, and the CSNEG should include // One or NegOne operand. ConstantSDNode *CTVal = dyn_cast(LHS.getOperand(0)); ConstantSDNode *CFVal = dyn_cast(LHS.getOperand(1)); if (!CTVal || !CFVal) return SDValue(); if (!(LHS.getOpcode() == AArch64ISD::CSEL && (CTVal->isOne() || CFVal->isOne())) && !(LHS.getOpcode() == AArch64ISD::CSNEG && (CTVal->isOne() || CFVal->isAllOnes()))) return SDValue(); // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc) if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() && !CFVal->isOne()) { std::swap(CTVal, CFVal); AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); } SDLoc DL(N); // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc) if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() && !CFVal->isAllOnes()) { APInt C = -1 * CFVal->getAPIntValue(); CTVal = cast(DAG.getConstant(C, DL, VT)); CFVal = cast(DAG.getAllOnesConstant(DL, VT)); AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); } // It might be neutral for larger constants, as the immediate need to be // materialized in a register. APInt ADDC = CTVal->getAPIntValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isLegalAddImmediate(ADDC.getSExtValue())) return SDValue(); assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) || (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) && "Unexpected constant value"); SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0)); SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32); SDValue Cmp = LHS.getOperand(3); return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp); } // ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y) static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); if (N->getOpcode() != ISD::ADD) return SDValue(); SDValue Dot = N->getOperand(0); SDValue A = N->getOperand(1); // Handle commutivity auto isZeroDot = [](SDValue Dot) { return (Dot.getOpcode() == AArch64ISD::UDOT || Dot.getOpcode() == AArch64ISD::SDOT) && isZerosVector(Dot.getOperand(0).getNode()); }; if (!isZeroDot(Dot)) std::swap(Dot, A); if (!isZeroDot(Dot)) return SDValue(); return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1), Dot.getOperand(2)); } static bool isNegatedInteger(SDValue Op) { return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)); } static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Zero = DAG.getConstant(0, DL, VT); return DAG.getNode(ISD::SUB, DL, VT, Zero, Op); } // Try to fold // // (neg (csel X, Y)) -> (csel (neg X), (neg Y)) // // The folding helps csel to be matched with csneg without generating // redundant neg instruction, which includes negation of the csel expansion // of abs node lowered by lowerABS. 
static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) { if (!isNegatedInteger(SDValue(N, 0))) return SDValue(); SDValue CSel = N->getOperand(1); if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse()) return SDValue(); SDValue N0 = CSel.getOperand(0); SDValue N1 = CSel.getOperand(1); // If both of them is not negations, it's not worth the folding as it // introduces two additional negations while reducing one negation. if (!isNegatedInteger(N0) && !isNegatedInteger(N1)) return SDValue(); SDValue N0N = getNegatedInteger(N0, DAG); SDValue N1N = getNegatedInteger(N1, DAG); SDLoc DL(N); EVT VT = CSel.getValueType(); return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2), CSel.getOperand(3)); } // The basic add/sub long vector instructions have variants with "2" on the end // which act on the high-half of their inputs. They are normally matched by // patterns like: // // (add (zeroext (extract_high LHS)), // (zeroext (extract_high RHS))) // -> uaddl2 vD, vN, vM // // However, if one of the extracts is something like a duplicate, this // instruction can still be used profitably. This function puts the DAG into a // more appropriate form for those patterns to trigger. static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; if (DCI.isBeforeLegalizeOps()) return SDValue(); MVT VT = N->getSimpleValueType(0); if (!VT.is128BitVector()) { if (N->getOpcode() == ISD::ADD) return performSetccAddFolding(N, DAG); return SDValue(); } // Make sure both branches are extended in the same way. SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); if ((LHS.getOpcode() != ISD::ZERO_EXTEND && LHS.getOpcode() != ISD::SIGN_EXTEND) || LHS.getOpcode() != RHS.getOpcode()) return SDValue(); unsigned ExtType = LHS.getOpcode(); // It's not worth doing if at least one of the inputs isn't already an // extract, but we don't know which it'll be so we have to try both. if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) { RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); if (!RHS.getNode()) return SDValue(); RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) { LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); if (!LHS.getNode()) return SDValue(); LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); } return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); } static bool isCMP(SDValue Op) { return Op.getOpcode() == AArch64ISD::SUBS && !Op.getNode()->hasAnyUseOfValue(0); } // (CSEL 1 0 CC Cond) => CC // (CSEL 0 1 CC Cond) => !CC static std::optional getCSETCondCode(SDValue Op) { if (Op.getOpcode() != AArch64ISD::CSEL) return std::nullopt; auto CC = static_cast(Op.getConstantOperandVal(2)); if (CC == AArch64CC::AL || CC == AArch64CC::NV) return std::nullopt; SDValue OpLHS = Op.getOperand(0); SDValue OpRHS = Op.getOperand(1); if (isOneConstant(OpLHS) && isNullConstant(OpRHS)) return CC; if (isNullConstant(OpLHS) && isOneConstant(OpRHS)) return getInvertedCondCode(CC); return std::nullopt; } // (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry) // (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry) static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) { SDValue CmpOp = Op->getOperand(2); if (!isCMP(CmpOp)) return SDValue(); if (IsAdd) { if (!isOneConstant(CmpOp.getOperand(1))) return SDValue(); } else { if (!isNullConstant(CmpOp.getOperand(0))) return SDValue(); } SDValue CsetOp = CmpOp->getOperand(IsAdd ? 
0 : 1); auto CC = getCSETCondCode(CsetOp); if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO)) return SDValue(); return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(), Op->getOperand(0), Op->getOperand(1), CsetOp.getOperand(3)); } // (ADC x 0 cond) => (CINC x HS cond) static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDValue Cond = N->getOperand(2); if (!isNullConstant(RHS)) return SDValue(); EVT VT = N->getValueType(0); SDLoc DL(N); // (CINC x cc cond) <=> (CSINC x x !cc cond) SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32); return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond); } // Transform vector add(zext i8 to i32, zext i8 to i32) // into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32) // This allows extra uses of saddl/uaddl at the lower vector widths, and less // extends. static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 || (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND && N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) || (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND && N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) || N->getOperand(0).getOperand(0).getValueType() != N->getOperand(1).getOperand(0).getValueType()) return SDValue(); SDValue N0 = N->getOperand(0).getOperand(0); SDValue N1 = N->getOperand(1).getOperand(0); EVT InVT = N0.getValueType(); EVT S1 = InVT.getScalarType(); EVT S2 = VT.getScalarType(); if ((S2 == MVT::i32 && S1 == MVT::i8) || (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) { SDLoc DL(N); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), S2.getHalfSizedIntegerVT(*DAG.getContext()), VT.getVectorElementCount()); SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0); SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1); SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1); return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp); } return SDValue(); } static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { SDLoc DL(N); EVT VT = N->getValueType(0); // A build vector of two extracted elements is equivalent to an // extract subvector where the inner vector is any-extended to the // extract_vector_elt VT. // (build_vector (extract_elt_iXX_to_i32 vec Idx+0) // (extract_elt_iXX_to_i32 vec Idx+1)) // => (extract_subvector (anyext_iXX_to_i32 vec) Idx) // For now, only consider the v2i32 case, which arises as a result of // legalization. if (VT != MVT::v2i32) return SDValue(); SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1); // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT. if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT && Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT && // Constant index. isa(Elt0->getOperand(1)) && isa(Elt1->getOperand(1)) && // Both EXTRACT_VECTOR_ELT from same vector... Elt0->getOperand(0) == Elt1->getOperand(0) && // ... and contiguous. First element's index +1 == second element's index. Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) && // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of // ResultType's known minimum vector length. 
      Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
    SDValue VecToExtend = Elt0->getOperand(0);
    EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
    if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
      return SDValue();

    SDValue SubvectorIdx =
        DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);

    SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
                       SubvectorIdx);
  }

  return SDValue();
}

static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
      N0.getOpcode() == AArch64ISD::DUP) {
    SDValue Op = N0.getOperand(0);
    if (VT.getScalarType() == MVT::i32 &&
        N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
      Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
  }

  return SDValue();
}

// Check whether a node is an extend or shift operand.
static bool isExtendOrShiftOperand(SDValue N) {
  unsigned Opcode = N.getOpcode();
  if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
    EVT SrcVT;
    if (Opcode == ISD::SIGN_EXTEND_INREG)
      SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
    else
      SrcVT = N.getOperand(0).getValueType();

    return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
  } else if (Opcode == ISD::AND) {
    ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
    if (!CSD)
      return false;
    uint64_t AndMask = CSD->getZExtValue();
    return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
  } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
    return isa<ConstantSDNode>(N.getOperand(1));
  }

  return false;
}

// (N - Y) + Z --> (Z - Y) + N
// when N is an extend or shift operand
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
                                         SelectionDAG &DAG) {
  auto IsOneUseExtend = [](SDValue N) {
    return N.hasOneUse() && isExtendOrShiftOperand(N);
  };

  // DAGCombiner will revert the combination when Z is constant, causing an
  // infinite loop, so don't enable the combination when Z is constant.
  // Likewise, if Z is a one-use extend or shift, the combination would keep
  // re-triggering against its own result, again looping forever.
  if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
    return SDValue();

  if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
    return SDValue();

  SDValue Shift = SUB.getOperand(0);
  if (!IsOneUseExtend(Shift))
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue Y = SUB.getOperand(1);
  SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
  return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
}

static SDValue performAddCombineForShiftedOperands(SDNode *N,
                                                   SelectionDAG &DAG) {
  // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
  // commutative.
  if (N->getOpcode() != ISD::ADD)
    return SDValue();

  // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
  // shifted register is only available for i32 and i64.
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
    return Val;
  if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
    return Val;

  uint64_t LHSImm = 0, RHSImm = 0;
  // If both operands are shifted by an immediate and the shift amount is not
  // greater than 4 for one operand, swap LHS and RHS to put the operand with
  // the smaller shift amount on RHS.
// // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD // with LSL (shift > 4). For the rest of processors, this is no-op for // performance or correctness. if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) && isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 && RHSImm > 4 && LHS.hasOneUse()) return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS); return SDValue(); } // The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2)) // This reassociates it back to allow the creation of more mls instructions. static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() != ISD::SUB) return SDValue(); SDValue Add = N->getOperand(1); SDValue X = N->getOperand(0); if (Add.getOpcode() != ISD::ADD) return SDValue(); if (!Add.hasOneUse()) return SDValue(); if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(X))) return SDValue(); SDValue M1 = Add.getOperand(0); SDValue M2 = Add.getOperand(1); if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL && M1.getOpcode() != AArch64ISD::UMULL) return SDValue(); if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL && M2.getOpcode() != AArch64ISD::UMULL) return SDValue(); EVT VT = N->getValueType(0); SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1); return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2); } // Combine into mla/mls. // This works on the patterns of: // add v1, (mul v2, v3) // sub v1, (mul v2, v3) // for vectors of type <1 x i64> and <2 x i64> when SVE is available. // It will transform the add/sub to a scalable version, so that we can // make use of SVE's MLA/MLS that will be generated for that pattern static SDValue performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; // Make sure that the types are legal if (!DCI.isAfterLegalizeDAG()) return SDValue(); // Before using SVE's features, check first if it's available. if (!DAG.getSubtarget().hasSVE()) return SDValue(); if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB) return SDValue(); if (!N->getValueType(0).isFixedLengthVector()) return SDValue(); auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue { if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR) return SDValue(); if (!cast(Op1->getOperand(1))->isZero()) return SDValue(); SDValue MulValue = Op1->getOperand(0); if (MulValue.getOpcode() != AArch64ISD::MUL_PRED) return SDValue(); if (!Op1.hasOneUse() || !MulValue.hasOneUse()) return SDValue(); EVT ScalableVT = MulValue.getValueType(); if (!ScalableVT.isScalableVector()) return SDValue(); SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0); SDValue NewValue = DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue}); return convertFromScalableVector(DAG, N->getValueType(0), NewValue); }; if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1))) return res; else if (N->getOpcode() == ISD::ADD) return performOpt(N->getOperand(1), N->getOperand(0)); return SDValue(); } // Given a i64 add from a v1i64 extract, convert to a neon v1i64 add. This can // help, for example, to produce ssra from sshr+add. 
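// For example (sketch):
//   (i64 (add (extract_vector_elt (v1i64 (sshr X, #imm)), 0), (i64 (load p))))
//   -> (extract_vector_elt
//        (v1i64 (add (sshr X, #imm), (scalar_to_vector (load p)))), 0)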
static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // At least one of the operands should be an extract, and the other should be
  // something that is easy to convert to v1i64 type (in this case a load).
  if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
      Op0.getOpcode() != ISD::LOAD)
    return SDValue();
  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
      Op1.getOpcode() != ISD::LOAD)
    return SDValue();

  SDLoc DL(N);
  if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      Op0.getOperand(0).getValueType() == MVT::v1i64) {
    Op0 = Op0.getOperand(0);
    Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
  } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
             Op1.getOperand(0).getValueType() == MVT::v1i64) {
    Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
    Op1 = Op1.getOperand(0);
  } else
    return SDValue();

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
                     DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
                     DAG.getConstant(0, DL, MVT::i64));
}

static bool isLoadOrMultipleLoads(SDValue B,
                                  SmallVector<LoadSDNode *> &Loads) {
  SDValue BV = peekThroughOneUseBitcasts(B);
  if (!BV->hasOneUse())
    return false;
  if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
    if (!Ld || !Ld->isSimple())
      return false;
    Loads.push_back(Ld);
    return true;
  } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
             BV.getOpcode() == ISD::CONCAT_VECTORS) {
    for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
      auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
      if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
        return false;
      Loads.push_back(Ld);
    }
    return true;
  } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
    // Try to find a tree of shuffles and concats from how IR shuffles of loads
    // are lowered. Note that this only comes up because we do not always visit
    // operands before uses. After that is fixed this can be removed and in the
    // meantime this is fairly specific to the lowering we expect from IR.
// t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE || B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS || B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS || B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS || B.getOperand(1).getNumOperands() != 4) return false; auto SV1 = cast(B); auto SV2 = cast(B.getOperand(0)); int NumElts = B.getValueType().getVectorNumElements(); int NumSubElts = NumElts / 4; for (int I = 0; I < NumSubElts; I++) { // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> if (SV1->getMaskElt(I) != I || SV1->getMaskElt(I + NumSubElts) != I + NumSubElts || SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 || SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts) return false; // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> if (SV2->getMaskElt(I) != I || SV2->getMaskElt(I + NumSubElts) != I + NumSubElts || SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts) return false; } auto *Ld0 = dyn_cast(SV2->getOperand(0).getOperand(0)); auto *Ld1 = dyn_cast(SV2->getOperand(0).getOperand(1)); auto *Ld2 = dyn_cast(SV2->getOperand(1).getOperand(0)); auto *Ld3 = dyn_cast(B.getOperand(1).getOperand(0)); if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() || !Ld2->isSimple() || !Ld3->isSimple()) return false; Loads.push_back(Ld0); Loads.push_back(Ld1); Loads.push_back(Ld2); Loads.push_back(Ld3); return true; } return false; } static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads) { if (!Op0.hasOneUse() || !Op1.hasOneUse()) return false; SmallVector Loads0, Loads1; if (isLoadOrMultipleLoads(Op0, Loads0) && isLoadOrMultipleLoads(Op1, Loads1)) { if (NumSubLoads && Loads0.size() != NumSubLoads) return false; NumSubLoads = Loads0.size(); return Loads0.size() == Loads1.size() && all_of(zip(Loads0, Loads1), [&DAG](auto L) { unsigned Size = get<0>(L)->getValueType(0).getSizeInBits(); return Size == get<1>(L)->getValueType(0).getSizeInBits() && DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L), Size / 8, 1); }); } if (Op0.getOpcode() != Op1.getOpcode()) return false; switch (Op0.getOpcode()) { case ISD::ADD: case ISD::SUB: return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0), DAG, NumSubLoads) && areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1), DAG, NumSubLoads); case ISD::SIGN_EXTEND: case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: EVT XVT = Op0.getOperand(0).getValueType(); if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 && XVT.getScalarSizeInBits() != 32) return false; return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0), DAG, NumSubLoads); } return false; } // This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)) // into a single load of twice the size, that we extract the bottom part and top // part so that the shl can use a shll2 instruction. 
The two loads in that // example can also be larger trees of instructions, which are identical except // for the leaves which are all loads offset from the LHS, including // buildvectors of multiple loads. For example the RHS tree could be // sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))) // Whilst it can be common for the larger loads to replace LDP instructions // (which doesn't gain anything on it's own), the larger loads can help create // more efficient code, and in buildvectors prevent the need for ld1 lane // inserts which can be slower than normal loads. static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); if (!VT.isFixedLengthVector() || (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 && VT.getScalarSizeInBits() != 64)) return SDValue(); SDValue Other = N->getOperand(0); SDValue Shift = N->getOperand(1); if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB) std::swap(Shift, Other); APInt ShiftAmt; if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() || !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt)) return SDValue(); if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) || !ISD::isExtOpcode(Other.getOpcode()) || Shift.getOperand(0).getOperand(0).getValueType() != Other.getOperand(0).getValueType() || !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse()) return SDValue(); SDValue Op0 = Other.getOperand(0); SDValue Op1 = Shift.getOperand(0).getOperand(0); unsigned NumSubLoads = 0; if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads)) return SDValue(); // Attempt to rule out some unprofitable cases using heuristics (some working // around suboptimal code generation), notably if the extend not be able to // use ushll2 instructions as the types are not large enough. Otherwise zip's // will need to be created which can increase the instruction count. unsigned NumElts = Op0.getValueType().getVectorNumElements(); unsigned NumSubElts = NumElts / NumSubLoads; if (NumSubElts * VT.getScalarSizeInBits() < 128 || (Other.getOpcode() != Shift.getOperand(0).getOpcode() && Op0.getValueType().getSizeInBits() < 128 && !DAG.getTargetLoweringInfo().isTypeLegal(Op0.getValueType()))) return SDValue(); // Recreate the tree with the new combined loads. 
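// (Matching leaf loads from the two operand trees are merged pairwise into
// loads of twice the width; the surrounding operations are then rebuilt at
// the doubled vector type.)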
std::function GenCombinedTree = [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) { EVT DVT = Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext()); SmallVector Loads0, Loads1; if (isLoadOrMultipleLoads(Op0, Loads0) && isLoadOrMultipleLoads(Op1, Loads1)) { EVT LoadVT = EVT::getVectorVT( *DAG.getContext(), Op0.getValueType().getScalarType(), Op0.getValueType().getVectorNumElements() / Loads0.size()); EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext()); SmallVector NewLoads; for (const auto &[L0, L1] : zip(Loads0, Loads1)) { SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(), L0->getBasePtr(), L0->getPointerInfo(), L0->getOriginalAlign()); DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1)); DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1)); NewLoads.push_back(Load); } return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads); } SmallVector Ops; for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values())) Ops.push_back(GenCombinedTree(O0, O1, DAG)); return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops); }; SDValue NewOp = GenCombinedTree(Op0, Op1, DAG); SmallVector LowMask(NumElts, 0), HighMask(NumElts, 0); int Hi = NumSubElts, Lo = 0; for (unsigned i = 0; i < NumSubLoads; i++) { for (unsigned j = 0; j < NumSubElts; j++) { LowMask[i * NumSubElts + j] = Lo++; HighMask[i * NumSubElts + j] = Hi++; } Lo += NumSubElts; Hi += NumSubElts; } SDLoc DL(N); SDValue Ext0, Ext1; // Extract the top and bottom lanes, then extend the result. Possibly extend // the result then extract the lanes if the two operands match as it produces // slightly smaller code. if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) { SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp, DAG.getConstant(0, DL, MVT::i64)); SDValue SubH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp, DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64)); SDValue Extr0 = DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask); SDValue Extr1 = DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask); Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0); Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1); } else { EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext()); SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp); SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext, DAG.getConstant(0, DL, MVT::i64)); SDValue SubH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext, DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64)); Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask); Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask); } SDValue NShift = DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1)); return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift); } static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // Try to change sum of two reductions. 
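// (i.e. ADD(UADDV a, UADDV b) --> UADDV(ADD a, b); see performAddUADDVCombine.)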
if (SDValue Val = performAddUADDVCombine(N, DCI.DAG)) return Val; if (SDValue Val = performAddDotCombine(N, DCI.DAG)) return Val; if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG)) return Val; if (SDValue Val = performNegCSelCombine(N, DCI.DAG)) return Val; if (SDValue Val = performVectorAddSubExtCombine(N, DCI.DAG)) return Val; if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG)) return Val; if (SDValue Val = performSubAddMULCombine(N, DCI.DAG)) return Val; if (SDValue Val = performSVEMulAddSubCombine(N, DCI)) return Val; if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG)) return Val; if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG)) return Val; return performAddSubLongCombine(N, DCI); } // Massage DAGs which we can use the high-half "long" operations on into // something isel will recognize better. E.g. // // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> // (aarch64_neon_umull (extract_high (v2i64 vec))) // (extract_high (v2i64 (dup128 scalar))))) // static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1); SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2); assert(LHS.getValueType().is64BitVector() && RHS.getValueType().is64BitVector() && "unexpected shape for long operation"); // Either node could be a DUP, but it's not worth doing both of them (you'd // just as well use the non-high version) so look for a corresponding extract // operation on the other "wing". if (isEssentiallyExtractHighSubvector(LHS)) { RHS = tryExtendDUPToExtractHigh(RHS, DAG); if (!RHS.getNode()) return SDValue(); } else if (isEssentiallyExtractHighSubvector(RHS)) { LHS = tryExtendDUPToExtractHigh(LHS, DAG); if (!LHS.getNode()) return SDValue(); } else return SDValue(); if (IID == Intrinsic::not_intrinsic) return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), N->getOperand(0), LHS, RHS); } static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { MVT ElemTy = N->getSimpleValueType(0).getScalarType(); unsigned ElemBits = ElemTy.getSizeInBits(); int64_t ShiftAmount; if (BuildVectorSDNode *BVN = dyn_cast(N->getOperand(2))) { APInt SplatValue, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, ElemBits) || SplatBitSize != ElemBits) return SDValue(); ShiftAmount = SplatValue.getSExtValue(); } else if (ConstantSDNode *CVN = dyn_cast(N->getOperand(2))) { ShiftAmount = CVN->getSExtValue(); } else return SDValue(); // If the shift amount is zero, remove the shift intrinsic. 
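// (sqshlu is excluded because even a zero-amount sqshlu still saturates
// negative inputs to zero, so it is not a no-op.)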
if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu) return N->getOperand(1); unsigned Opcode; bool IsRightShift; switch (IID) { default: llvm_unreachable("Unknown shift intrinsic"); case Intrinsic::aarch64_neon_sqshl: Opcode = AArch64ISD::SQSHL_I; IsRightShift = false; break; case Intrinsic::aarch64_neon_uqshl: Opcode = AArch64ISD::UQSHL_I; IsRightShift = false; break; case Intrinsic::aarch64_neon_srshl: Opcode = AArch64ISD::SRSHR_I; IsRightShift = true; break; case Intrinsic::aarch64_neon_urshl: Opcode = AArch64ISD::URSHR_I; IsRightShift = true; break; case Intrinsic::aarch64_neon_sqshlu: Opcode = AArch64ISD::SQSHLU_I; IsRightShift = false; break; case Intrinsic::aarch64_neon_sshl: case Intrinsic::aarch64_neon_ushl: // For positive shift amounts we can use SHL, as ushl/sshl perform a regular // left shift for positive shift amounts. Below, we only replace the current // node with VSHL, if this condition is met. Opcode = AArch64ISD::VSHL; IsRightShift = false; break; } EVT VT = N->getValueType(0); SDValue Op = N->getOperand(1); SDLoc dl(N); if (VT == MVT::i64) { Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op); VT = MVT::v1i64; } if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { Op = DAG.getNode(Opcode, dl, VT, Op, DAG.getConstant(-ShiftAmount, dl, MVT::i32)); if (N->getValueType(0) == MVT::i64) Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op, DAG.getConstant(0, dl, MVT::i64)); return Op; } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) { Op = DAG.getNode(Opcode, dl, VT, Op, DAG.getConstant(ShiftAmount, dl, MVT::i32)); if (N->getValueType(0) == MVT::i64) Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op, DAG.getConstant(0, dl, MVT::i64)); return Op; } return SDValue(); } // The CRC32[BH] instructions ignore the high bits of their data operand. Since // the intrinsics must be legal and take an i32, this means there's almost // certainly going to be a zext in the DAG which we can eliminate. static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { SDValue AndN = N->getOperand(2); if (AndN.getOpcode() != ISD::AND) return SDValue(); ConstantSDNode *CMask = dyn_cast(AndN.getOperand(1)); if (!CMask || CMask->getZExtValue() != Mask) return SDValue(); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); } static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG) { SDLoc dl(N); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), DAG.getNode(Opc, dl, N->getOperand(1).getSimpleValueType(), N->getOperand(1)), DAG.getConstant(0, dl, MVT::i64)); } static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); SDValue Op1 = N->getOperand(1); SDValue Op2 = N->getOperand(2); EVT ScalarTy = Op2.getValueType(); if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) ScalarTy = MVT::i32; // Lower index_vector(base, step) to mul(step step_vector(1)) + splat(base). 
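// For example (illustrative), index_vector(3, 2) producing nxv4i32 becomes
//   step_vector <0,1,2,3,...> * splat(2) + splat(3) = <3,5,7,9,...>.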
SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0)); SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2); SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step); SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1); return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base); } static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) { SDLoc dl(N); SDValue Scalar = N->getOperand(3); EVT ScalarTy = Scalar.getValueType(); if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar); SDValue Passthru = N->getOperand(1); SDValue Pred = N->getOperand(2); return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0), Pred, Scalar, Passthru); } static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) { SDLoc dl(N); LLVMContext &Ctx = *DAG.getContext(); EVT VT = N->getValueType(0); assert(VT.isScalableVector() && "Expected a scalable vector."); // Current lowering only supports the SVE-ACLE types. if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock) return SDValue(); unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8; unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8; EVT ByteVT = EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize)); // Convert everything to the domain of EXT (i.e bytes). SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1)); SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2)); SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3), DAG.getConstant(ElemSize, dl, MVT::i32)); SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2); return DAG.getNode(ISD::BITCAST, dl, VT, EXT); } static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalize()) return SDValue(); SDValue Comparator = N->getOperand(3); if (Comparator.getOpcode() == AArch64ISD::DUP || Comparator.getOpcode() == ISD::SPLAT_VECTOR) { unsigned IID = getIntrinsicID(N); EVT VT = N->getValueType(0); EVT CmpVT = N->getOperand(2).getValueType(); SDValue Pred = N->getOperand(1); SDValue Imm; SDLoc DL(N); switch (IID) { default: llvm_unreachable("Called with wrong intrinsic!"); break; // Signed comparisons case Intrinsic::aarch64_sve_cmpeq_wide: case Intrinsic::aarch64_sve_cmpne_wide: case Intrinsic::aarch64_sve_cmpge_wide: case Intrinsic::aarch64_sve_cmpgt_wide: case Intrinsic::aarch64_sve_cmplt_wide: case Intrinsic::aarch64_sve_cmple_wide: { if (auto *CN = dyn_cast(Comparator.getOperand(0))) { int64_t ImmVal = CN->getSExtValue(); if (ImmVal >= -16 && ImmVal <= 15) Imm = DAG.getConstant(ImmVal, DL, MVT::i32); else return SDValue(); } break; } // Unsigned comparisons case Intrinsic::aarch64_sve_cmphs_wide: case Intrinsic::aarch64_sve_cmphi_wide: case Intrinsic::aarch64_sve_cmplo_wide: case Intrinsic::aarch64_sve_cmpls_wide: { if (auto *CN = dyn_cast(Comparator.getOperand(0))) { uint64_t ImmVal = CN->getZExtValue(); if (ImmVal <= 127) Imm = DAG.getConstant(ImmVal, DL, MVT::i32); else return SDValue(); } break; } } if (!Imm) return SDValue(); SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm); return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred, N->getOperand(2), Splat, DAG.getCondCode(CC)); } return SDValue(); } static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond) { const TargetLowering &TLI = 
DAG.getTargetLoweringInfo(); SDLoc DL(Op); assert(Op.getValueType().isScalableVector() && TLI.isTypeLegal(Op.getValueType()) && "Expected legal scalable vector type!"); assert(Op.getValueType() == Pg.getValueType() && "Expected same type for PTEST operands"); // Ensure target specific opcodes are using legal type. EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue TVal = DAG.getConstant(1, DL, OutVT); SDValue FVal = DAG.getConstant(0, DL, OutVT); // Ensure operands have type nxv16i1. if (Op.getValueType() != MVT::nxv16i1) { if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) && isZeroingInactiveLanes(Op)) Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg); else Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG); Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op); } // Set condition code (CC) flags. SDValue Test = DAG.getNode( Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST, DL, MVT::Other, Pg, Op); // Convert CC to integer based on requested condition. // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare. SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32); SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test); return DAG.getZExtOrTrunc(Res, DL, VT); } static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG) { SDLoc DL(N); SDValue Pred = N->getOperand(1); SDValue VecToReduce = N->getOperand(2); // NOTE: The integer reduction's result type is not always linked to the // operand's element type so we construct it from the intrinsic's result type. EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0)); SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce); // SVE reductions set the whole vector register with the first element // containing the reduction result, which we'll now extract. SDValue Zero = DAG.getConstant(0, DL, MVT::i64); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, Zero); } static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG) { SDLoc DL(N); SDValue Pred = N->getOperand(1); SDValue VecToReduce = N->getOperand(2); EVT ReduceVT = VecToReduce.getValueType(); SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce); // SVE reductions set the whole vector register with the first element // containing the reduction result, which we'll now extract. SDValue Zero = DAG.getConstant(0, DL, MVT::i64); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, Zero); } static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG) { SDLoc DL(N); SDValue Pred = N->getOperand(1); SDValue InitVal = N->getOperand(2); SDValue VecToReduce = N->getOperand(3); EVT ReduceVT = VecToReduce.getValueType(); // Ordered reductions use the first lane of the result vector as the // reduction's initial value. SDValue Zero = DAG.getConstant(0, DL, MVT::i64); InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT, DAG.getUNDEF(ReduceVT), InitVal, Zero); SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce); // SVE reductions set the whole vector register with the first element // containing the reduction result, which we'll now extract. 
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, Zero); } // If a merged operation has no inactive lanes we can relax it to a predicated // or unpredicated operation, which potentially allows better isel (perhaps // using immediate forms) or relaxing register reuse requirements. static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, SelectionDAG &DAG, bool UnpredOp = false, bool SwapOperands = false) { assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!"); assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!"); SDValue Pg = N->getOperand(1); SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2); SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3); // ISD way to specify an all active predicate. if (isAllActivePredicate(DAG, Pg)) { if (UnpredOp) return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2); return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2); } // FUTURE: SplatVector(true) return SDValue(); } static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; unsigned IID = getIntrinsicID(N); switch (IID) { default: break; case Intrinsic::get_active_lane_mask: { SDValue Res = SDValue(); EVT VT = N->getValueType(0); if (VT.isFixedLengthVector()) { // We can use the SVE whilelo instruction to lower this intrinsic by // creating the appropriate sequence of scalable vector operations and // then extracting a fixed-width subvector from the scalable vector. SDLoc DL(N); SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64); EVT WhileVT = EVT::getVectorVT( *DAG.getContext(), MVT::i1, ElementCount::getScalable(VT.getVectorNumElements())); // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32. EVT PromVT = getPromotedVTForPredicate(WhileVT); // Get the fixed-width equivalent of PromVT for extraction. 
EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(), VT.getVectorElementCount()); Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID, N->getOperand(1), N->getOperand(2)); Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res); Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res, DAG.getConstant(0, DL, MVT::i64)); Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res); } return Res; } case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: return tryCombineFixedPointConvert(N, DCI, DAG); case Intrinsic::aarch64_neon_saddv: return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); case Intrinsic::aarch64_neon_uaddv: return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG); case Intrinsic::aarch64_neon_sminv: return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG); case Intrinsic::aarch64_neon_uminv: return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG); case Intrinsic::aarch64_neon_smaxv: return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG); case Intrinsic::aarch64_neon_umaxv: return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); case Intrinsic::aarch64_neon_fmax: return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fmin: return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fmaxnm: return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fminnm: return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_smull: return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_umull: return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_pmull: return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_sqdmull: return tryCombineLongOpWithDup(IID, N, DCI, DAG); case Intrinsic::aarch64_neon_sqshl: case Intrinsic::aarch64_neon_uqshl: case Intrinsic::aarch64_neon_sqshlu: case Intrinsic::aarch64_neon_srshl: case Intrinsic::aarch64_neon_urshl: case Intrinsic::aarch64_neon_sshl: case Intrinsic::aarch64_neon_ushl: return tryCombineShiftImm(IID, N, DAG); case Intrinsic::aarch64_neon_rshrn: { EVT VT = N->getOperand(1).getValueType(); SDLoc DL(N); SDValue Imm = DAG.getConstant(1LLU << (N->getConstantOperandVal(2) - 1), DL, VT); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(1), Imm); SDValue Sht = DAG.getNode(ISD::SRL, DL, VT, Add, DAG.getConstant(N->getConstantOperandVal(2), DL, VT)); return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Sht); } case Intrinsic::aarch64_neon_sabd: return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_uabd: return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_crc32b: case Intrinsic::aarch64_crc32cb: return tryCombineCRC32(0xff, N, DAG); case Intrinsic::aarch64_crc32h: case Intrinsic::aarch64_crc32ch: return tryCombineCRC32(0xffff, N, DAG); case Intrinsic::aarch64_sve_saddv: // There is no i64 version of SADDV because the sign is irrelevant. 
if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64) return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG); else return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG); case Intrinsic::aarch64_sve_uaddv: return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG); case Intrinsic::aarch64_sve_smaxv: return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG); case Intrinsic::aarch64_sve_umaxv: return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG); case Intrinsic::aarch64_sve_sminv: return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG); case Intrinsic::aarch64_sve_uminv: return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG); case Intrinsic::aarch64_sve_orv: return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG); case Intrinsic::aarch64_sve_eorv: return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG); case Intrinsic::aarch64_sve_andv: return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG); case Intrinsic::aarch64_sve_index: return LowerSVEIntrinsicIndex(N, DAG); case Intrinsic::aarch64_sve_dup: return LowerSVEIntrinsicDUP(N, DAG); case Intrinsic::aarch64_sve_dup_x: return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0), N->getOperand(1)); case Intrinsic::aarch64_sve_ext: return LowerSVEIntrinsicEXT(N, DAG); case Intrinsic::aarch64_sve_mul_u: return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_smulh_u: return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_umulh_u: return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_smin_u: return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_umin_u: return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_smax_u: return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_umax_u: return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_lsl_u: return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_lsr_u: return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_asr_u: return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_fadd_u: return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_fdiv_u: return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_fmax_u: return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_fmaxnm_u: return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0), 
N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_fmla_u: return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(3), N->getOperand(4), N->getOperand(2)); case Intrinsic::aarch64_sve_fmin_u: return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_fminnm_u: return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_fmul_u: return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_fsub_u: return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_add_u: return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_sub_u: return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_subr: return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true); case Intrinsic::aarch64_sve_and_u: return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_bic_u: return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_eor_u: return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_orr_u: return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_sabd_u: return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_uabd_u: return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_sdiv_u: return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_udiv_u: return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_sqadd: return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true); case Intrinsic::aarch64_sve_sqsub_u: return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_uqadd: return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true); case Intrinsic::aarch64_sve_uqsub_u: return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_sqadd_x: return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_sve_sqsub_x: return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_sve_uqadd_x: return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_sve_uqsub_x: return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_sve_asrd: return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case 
Intrinsic::aarch64_sve_cmphs: if (!N->getOperand(2).getValueType().isFloatingPoint()) return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), DAG.getCondCode(ISD::SETUGE)); break; case Intrinsic::aarch64_sve_cmphi: if (!N->getOperand(2).getValueType().isFloatingPoint()) return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), DAG.getCondCode(ISD::SETUGT)); break; case Intrinsic::aarch64_sve_fcmpge: case Intrinsic::aarch64_sve_cmpge: return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), DAG.getCondCode(ISD::SETGE)); break; case Intrinsic::aarch64_sve_fcmpgt: case Intrinsic::aarch64_sve_cmpgt: return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), DAG.getCondCode(ISD::SETGT)); break; case Intrinsic::aarch64_sve_fcmpeq: case Intrinsic::aarch64_sve_cmpeq: return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), DAG.getCondCode(ISD::SETEQ)); break; case Intrinsic::aarch64_sve_fcmpne: case Intrinsic::aarch64_sve_cmpne: return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), DAG.getCondCode(ISD::SETNE)); break; case Intrinsic::aarch64_sve_fcmpuo: return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), DAG.getCondCode(ISD::SETUO)); break; case Intrinsic::aarch64_sve_fadda: return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG); case Intrinsic::aarch64_sve_faddv: return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG); case Intrinsic::aarch64_sve_fmaxnmv: return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG); case Intrinsic::aarch64_sve_fmaxv: return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG); case Intrinsic::aarch64_sve_fminnmv: return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG); case Intrinsic::aarch64_sve_fminv: return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG); case Intrinsic::aarch64_sve_sel: return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_cmpeq_wide: return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG); case Intrinsic::aarch64_sve_cmpne_wide: return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG); case Intrinsic::aarch64_sve_cmpge_wide: return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG); case Intrinsic::aarch64_sve_cmpgt_wide: return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG); case Intrinsic::aarch64_sve_cmplt_wide: return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG); case Intrinsic::aarch64_sve_cmple_wide: return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG); case Intrinsic::aarch64_sve_cmphs_wide: return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG); case Intrinsic::aarch64_sve_cmphi_wide: return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG); case Intrinsic::aarch64_sve_cmplo_wide: return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG); case Intrinsic::aarch64_sve_cmpls_wide: return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG); case Intrinsic::aarch64_sve_ptest_any: return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), 
AArch64CC::ANY_ACTIVE); case Intrinsic::aarch64_sve_ptest_first: return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), AArch64CC::FIRST_ACTIVE); case Intrinsic::aarch64_sve_ptest_last: return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), AArch64CC::LAST_ACTIVE); } return SDValue(); } static bool isCheapToExtend(const SDValue &N) { unsigned OC = N->getOpcode(); return OC == ISD::LOAD || OC == ISD::MLOAD || ISD::isConstantSplatVectorAllZeros(N.getNode()); } static SDValue performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // If we have (sext (setcc A B)) and A and B are cheap to extend, // we can move the sext into the arguments and have the same result. For // example, if A and B are both loads, we can make those extending loads and // avoid an extra instruction. This pattern appears often in VLS code // generation where the inputs to the setcc have a different size to the // instruction that wants to use the result of the setcc. assert(N->getOpcode() == ISD::SIGN_EXTEND && N->getOperand(0)->getOpcode() == ISD::SETCC); const SDValue SetCC = N->getOperand(0); const SDValue CCOp0 = SetCC.getOperand(0); const SDValue CCOp1 = SetCC.getOperand(1); if (!CCOp0->getValueType(0).isInteger() || !CCOp1->getValueType(0).isInteger()) return SDValue(); ISD::CondCode Code = cast(SetCC->getOperand(2).getNode())->get(); ISD::NodeType ExtType = isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; if (isCheapToExtend(SetCC.getOperand(0)) && isCheapToExtend(SetCC.getOperand(1))) { const SDValue Ext1 = DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0); const SDValue Ext2 = DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1); return DAG.getSetCC( SDLoc(SetCC), N->getValueType(0), Ext1, Ext2, cast(SetCC->getOperand(2).getNode())->get()); } return SDValue(); } static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then // we can convert that DUP into another extract_high (of a bigger DUP), which // helps the backend to decide that an sabdl2 would be useful, saving a real // extract_high operation. if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && (N->getOperand(0).getOpcode() == ISD::ABDU || N->getOperand(0).getOpcode() == ISD::ABDS)) { SDNode *ABDNode = N->getOperand(0).getNode(); SDValue NewABD = tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG); if (!NewABD.getNode()) return SDValue(); return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD); } if (N->getValueType(0).isFixedLengthVector() && N->getOpcode() == ISD::SIGN_EXTEND && N->getOperand(0)->getOpcode() == ISD::SETCC) return performSignExtendSetCCCombine(N, DCI, DAG); return SDValue(); } static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts) { assert(!St.isTruncatingStore() && "cannot split truncating vector store"); Align OrigAlignment = St.getAlign(); unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8; // Create scalar stores. This is at least as good as the code sequence for a // split unaligned store which is a dup.s, ext.b, and two stores. // Most of the time the three stores should be replaced by store pair // instructions (stp). 
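  // For example (roughly), storing a v4i32 splat of w1 to [x0] becomes four
  // scalar stores of w1 at offsets #0, #4, #8 and #12, which the load/store
  // optimizer is normally able to merge into:
  //   stp w1, w1, [x0]
  //   stp w1, w1, [x0, #8]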
SDLoc DL(&St); SDValue BasePtr = St.getBasePtr(); uint64_t BaseOffset = 0; const MachinePointerInfo &PtrInfo = St.getPointerInfo(); SDValue NewST1 = DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo, OrigAlignment, St.getMemOperand()->getFlags()); // As this in ISel, we will not merge this add which may degrade results. if (BasePtr->getOpcode() == ISD::ADD && isa(BasePtr->getOperand(1))) { BaseOffset = cast(BasePtr->getOperand(1))->getSExtValue(); BasePtr = BasePtr->getOperand(0); } unsigned Offset = EltOffset; while (--NumVecElts) { Align Alignment = commonAlignment(OrigAlignment, Offset); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(BaseOffset + Offset, DL, MVT::i64)); NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, PtrInfo.getWithOffset(Offset), Alignment, St.getMemOperand()->getFlags()); Offset += EltOffset; } return NewST1; } // Returns an SVE type that ContentTy can be trivially sign or zero extended // into. static MVT getSVEContainerType(EVT ContentTy) { assert(ContentTy.isSimple() && "No SVE containers for extended types"); switch (ContentTy.getSimpleVT().SimpleTy) { default: llvm_unreachable("No known SVE container for this MVT type"); case MVT::nxv2i8: case MVT::nxv2i16: case MVT::nxv2i32: case MVT::nxv2i64: case MVT::nxv2f32: case MVT::nxv2f64: return MVT::nxv2i64; case MVT::nxv4i8: case MVT::nxv4i16: case MVT::nxv4i32: case MVT::nxv4f32: return MVT::nxv4i32; case MVT::nxv8i8: case MVT::nxv8i16: case MVT::nxv8f16: case MVT::nxv8bf16: return MVT::nxv8i16; case MVT::nxv16i8: return MVT::nxv16i8; } } static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) { SDLoc DL(N); EVT VT = N->getValueType(0); if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock) return SDValue(); EVT ContainerVT = VT; if (ContainerVT.isInteger()) ContainerVT = getSVEContainerType(ContainerVT); SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other); SDValue Ops[] = { N->getOperand(0), // Chain N->getOperand(2), // Pg N->getOperand(3), // Base DAG.getValueType(VT) }; SDValue Load = DAG.getNode(Opc, DL, VTs, Ops); SDValue LoadChain = SDValue(Load.getNode(), 1); if (ContainerVT.isInteger() && (VT != ContainerVT)) Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0)); return DAG.getMergeValues({ Load, LoadChain }, DL); } static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); EVT VT = N->getValueType(0); EVT PtrTy = N->getOperand(3).getValueType(); EVT LoadVT = VT; if (VT.isFloatingPoint()) LoadVT = VT.changeTypeToInteger(); auto *MINode = cast(N); SDValue PassThru = DAG.getConstant(0, DL, LoadVT); SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(), MINode->getOperand(3), DAG.getUNDEF(PtrTy), MINode->getOperand(2), PassThru, MINode->getMemoryVT(), MINode->getMemOperand(), ISD::UNINDEXED, ISD::NON_EXTLOAD, false); if (VT.isFloatingPoint()) { SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) }; return DAG.getMergeValues(Ops, DL); } return L; } template static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) { static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO || Opcode == AArch64ISD::LD1RO_MERGE_ZERO, "Unsupported opcode."); SDLoc DL(N); EVT VT = N->getValueType(0); EVT LoadVT = VT; if (VT.isFloatingPoint()) LoadVT = VT.changeTypeToInteger(); SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)}; SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops); SDValue LoadChain = SDValue(Load.getNode(), 1); if 
(VT.isFloatingPoint()) Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0)); return DAG.getMergeValues({Load, LoadChain}, DL); } static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); SDValue Data = N->getOperand(2); EVT DataVT = Data.getValueType(); EVT HwSrcVt = getSVEContainerType(DataVT); SDValue InputVT = DAG.getValueType(DataVT); if (DataVT.isFloatingPoint()) InputVT = DAG.getValueType(HwSrcVt); SDValue SrcNew; if (Data.getValueType().isFloatingPoint()) SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data); else SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data); SDValue Ops[] = { N->getOperand(0), // Chain SrcNew, N->getOperand(4), // Base N->getOperand(3), // Pg InputVT }; return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops); } static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); SDValue Data = N->getOperand(2); EVT DataVT = Data.getValueType(); EVT PtrTy = N->getOperand(4).getValueType(); if (DataVT.isFloatingPoint()) Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data); auto *MINode = cast(N); return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4), DAG.getUNDEF(PtrTy), MINode->getOperand(3), MINode->getMemoryVT(), MINode->getMemOperand(), ISD::UNINDEXED, false, false); } /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The /// load store optimizer pass will merge them to store pair stores. This should /// be better than a movi to create the vector zero followed by a vector store /// if the zero constant is not re-used, since one instructions and one register /// live range will be removed. /// /// For example, the final generated code should be: /// /// stp xzr, xzr, [x0] /// /// instead of: /// /// movi v0.2d, #0 /// str q0, [x0] /// static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) { SDValue StVal = St.getValue(); EVT VT = StVal.getValueType(); // Avoid scalarizing zero splat stores for scalable vectors. if (VT.isScalableVector()) return SDValue(); // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or // 2, 3 or 4 i32 elements. int NumVecElts = VT.getVectorNumElements(); if (!(((NumVecElts == 2 || NumVecElts == 3) && VT.getVectorElementType().getSizeInBits() == 64) || ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) && VT.getVectorElementType().getSizeInBits() == 32))) return SDValue(); if (StVal.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); // If the zero constant has more than one use then the vector store could be // better since the constant mov will be amortized and stp q instructions // should be able to be formed. if (!StVal.hasOneUse()) return SDValue(); // If the store is truncating then it's going down to i16 or smaller, which // means it can be implemented in a single store anyway. if (St.isTruncatingStore()) return SDValue(); // If the immediate offset of the address operand is too large for the stp // instruction, then bail out. if (DAG.isBaseWithConstantOffset(St.getBasePtr())) { int64_t Offset = St.getBasePtr()->getConstantOperandVal(1); if (Offset < -512 || Offset > 504) return SDValue(); } for (int I = 0; I < NumVecElts; ++I) { SDValue EltVal = StVal.getOperand(I); if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal)) return SDValue(); } // Use a CopyFromReg WZR/XZR here to prevent // DAGCombiner::MergeConsecutiveStores from undoing this transformation. 
SDLoc DL(&St); unsigned ZeroReg; EVT ZeroVT; if (VT.getVectorElementType().getSizeInBits() == 32) { ZeroReg = AArch64::WZR; ZeroVT = MVT::i32; } else { ZeroReg = AArch64::XZR; ZeroVT = MVT::i64; } SDValue SplatVal = DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT); return splitStoreSplat(DAG, St, SplatVal, NumVecElts); } /// Replace a splat of a scalar to a vector store by scalar stores of the scalar /// value. The load store optimizer pass will merge them to store pair stores. /// This has better performance than a splat of the scalar followed by a split /// vector store. Even if the stores are not merged it is four stores vs a dup, /// followed by an ext.b and two stores. static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) { SDValue StVal = St.getValue(); EVT VT = StVal.getValueType(); // Don't replace floating point stores, they possibly won't be transformed to // stp because of the store pair suppress pass. if (VT.isFloatingPoint()) return SDValue(); // We can express a splat as store pair(s) for 2 or 4 elements. unsigned NumVecElts = VT.getVectorNumElements(); if (NumVecElts != 4 && NumVecElts != 2) return SDValue(); // If the store is truncating then it's going down to i16 or smaller, which // means it can be implemented in a single store anyway. if (St.isTruncatingStore()) return SDValue(); // Check that this is a splat. // Make sure that each of the relevant vector element locations are inserted // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32. std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1); SDValue SplatVal; for (unsigned I = 0; I < NumVecElts; ++I) { // Check for insert vector elements. if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) return SDValue(); // Check that same value is inserted at each vector element. if (I == 0) SplatVal = StVal.getOperand(1); else if (StVal.getOperand(1) != SplatVal) return SDValue(); // Check insert element index. ConstantSDNode *CIndex = dyn_cast(StVal.getOperand(2)); if (!CIndex) return SDValue(); uint64_t IndexVal = CIndex->getZExtValue(); if (IndexVal >= NumVecElts) return SDValue(); IndexNotInserted.reset(IndexVal); StVal = StVal.getOperand(0); } // Check that all vector element locations were inserted to. if (IndexNotInserted.any()) return SDValue(); return splitStoreSplat(DAG, St, SplatVal, NumVecElts); } static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { StoreSDNode *S = cast(N); if (S->isVolatile() || S->isIndexed()) return SDValue(); SDValue StVal = S->getValue(); EVT VT = StVal.getValueType(); if (!VT.isFixedLengthVector()) return SDValue(); // If we get a splat of zeros, convert this vector store to a store of // scalars. They will be merged into store pairs of xzr thereby removing one // instruction and one register. if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S)) return ReplacedZeroSplat; // FIXME: The logic for deciding if an unaligned store should be split should // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be // a call to that function here. if (!Subtarget->isMisaligned128StoreSlow()) return SDValue(); // Don't split at -Oz. if (DAG.getMachineFunction().getFunction().hasMinSize()) return SDValue(); // Don't split v2i64 vectors. Memcpy lowering produces those and splitting // those up regresses performance on micro-benchmarks and olden/bh. if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64) return SDValue(); // Split unaligned 16B stores. 
// They are terrible for performance.
  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
  // extensions can use this to mark that it does not want splitting to happen
  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
  if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
      S->getAlign() <= Align(2))
    return SDValue();

  // If we get a splat of a scalar convert this vector store to a store of
  // scalars. They will be merged into store pairs thereby removing two
  // instructions.
  if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
    return ReplacedSplat;

  SDLoc DL(S);

  // Split VT into two.
  EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  unsigned NumElts = HalfVT.getVectorNumElements();
  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
                                   DAG.getConstant(0, DL, MVT::i64));
  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
                                   DAG.getConstant(NumElts, DL, MVT::i64));
  SDValue BasePtr = S->getBasePtr();
  SDValue NewST1 =
      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
                   S->getAlign(), S->getMemOperand()->getFlags());
  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                                  DAG.getConstant(8, DL, MVT::i64));
  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
                      S->getPointerInfo(), S->getAlign(),
                      S->getMemOperand()->getFlags());
}

static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");

  // splice(pg, op1, undef) -> op1
  if (N->getOperand(2).isUndef())
    return N->getOperand(1);

  return SDValue();
}

static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
                                    const AArch64Subtarget *Subtarget) {
  assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
          N->getOpcode() == AArch64ISD::UUNPKLO) &&
         "Unexpected Opcode!");

  // uunpklo/hi undef -> undef
  if (N->getOperand(0).isUndef())
    return DAG.getUNDEF(N->getValueType(0));

  // If this is a masked load followed by an UUNPKLO, fold this into a masked
  // extending load. We can do this even if this is already a masked
  // {z,}extload.
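  // E.g. a masked load of i8 data that is only ever consumed via UUNPKLO can
  // instead be emitted directly as a zero-extending masked load of the wider
  // element type, as long as the governing PTRUE pattern still covers the
  // same number of lanes at the doubled element size.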
if (N->getOperand(0).getOpcode() == ISD::MLOAD && N->getOpcode() == AArch64ISD::UUNPKLO) { MaskedLoadSDNode *MLD = cast(N->getOperand(0)); SDValue Mask = MLD->getMask(); SDLoc DL(N); if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD && SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE && (MLD->getPassThru()->isUndef() || isZerosVector(MLD->getPassThru().getNode()))) { unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits(); unsigned PgPattern = Mask->getConstantOperandVal(0); EVT VT = N->getValueType(0); // Ensure we can double the size of the predicate pattern unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern); if (NumElts && NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) { Mask = getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern); SDValue PassThru = DAG.getConstant(0, DL, VT); SDValue NewLoad = DAG.getMaskedLoad( VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask, PassThru, MLD->getMemoryVT(), MLD->getMemOperand(), MLD->getAddressingMode(), ISD::ZEXTLOAD); DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1)); return NewLoad; } } } return SDValue(); } static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); EVT ResVT = N->getValueType(0); // uzp1(x, undef) -> concat(truncate(x), undef) if (Op1.getOpcode() == ISD::UNDEF) { EVT BCVT = MVT::Other, HalfVT = MVT::Other; switch (ResVT.getSimpleVT().SimpleTy) { default: break; case MVT::v16i8: BCVT = MVT::v8i16; HalfVT = MVT::v8i8; break; case MVT::v8i16: BCVT = MVT::v4i32; HalfVT = MVT::v4i16; break; case MVT::v4i32: BCVT = MVT::v2i64; HalfVT = MVT::v2i32; break; } if (BCVT != MVT::Other) { SDValue BC = DAG.getBitcast(BCVT, Op0); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC); return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc, DAG.getUNDEF(HalfVT)); } } // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z) if (Op0.getOpcode() == AArch64ISD::UUNPKLO) { if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) { SDValue X = Op0.getOperand(0).getOperand(0); return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1); } } // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z) if (Op1.getOpcode() == AArch64ISD::UUNPKHI) { if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) { SDValue Z = Op1.getOperand(0).getOperand(1); return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z); } } // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y)) // Only implemented on little-endian subtargets. bool IsLittleEndian = DAG.getDataLayout().isLittleEndian(); // This optimization only works on little endian. 
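  // E.g. for a v4i16 result (roughly):
  //   (v4i16 uzp1 (trunc v4i32 x), (trunc v4i32 y))
  // is rewritten so that the two narrows become a single UZP1 on the wider
  // v8i16 bitcasts of x and y, followed by one bitcast/truncate back to v4i16.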
if (!IsLittleEndian) return SDValue(); if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8) return SDValue(); auto getSourceOp = [](SDValue Operand) -> SDValue { const unsigned Opcode = Operand.getOpcode(); if (Opcode == ISD::TRUNCATE) return Operand->getOperand(0); if (Opcode == ISD::BITCAST && Operand->getOperand(0).getOpcode() == ISD::TRUNCATE) return Operand->getOperand(0)->getOperand(0); return SDValue(); }; SDValue SourceOp0 = getSourceOp(Op0); SDValue SourceOp1 = getSourceOp(Op1); if (!SourceOp0 || !SourceOp1) return SDValue(); if (SourceOp0.getValueType() != SourceOp1.getValueType() || !SourceOp0.getValueType().isSimple()) return SDValue(); EVT ResultTy; switch (SourceOp0.getSimpleValueType().SimpleTy) { case MVT::v2i64: ResultTy = MVT::v4i32; break; case MVT::v4i32: ResultTy = MVT::v8i16; break; case MVT::v8i16: ResultTy = MVT::v16i8; break; default: return SDValue(); } SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0); SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1); SDValue UzpResult = DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1); EVT BitcastResultTy; switch (ResVT.getSimpleVT().SimpleTy) { case MVT::v2i32: BitcastResultTy = MVT::v2i64; break; case MVT::v4i16: BitcastResultTy = MVT::v4i32; break; case MVT::v8i8: BitcastResultTy = MVT::v8i16; break; default: llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}"); } return DAG.getNode(ISD::TRUNCATE, DL, ResVT, DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult)); } static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) { unsigned Opc = N->getOpcode(); assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) || (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) && "Invalid opcode."); const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO || Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO; const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO || Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO; const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO || Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO || Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO || Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO; SDLoc DL(N); SDValue Chain = N->getOperand(0); SDValue Pg = N->getOperand(1); SDValue Base = N->getOperand(2); SDValue Offset = N->getOperand(3); SDValue Ty = N->getOperand(4); EVT ResVT = N->getValueType(0); const auto OffsetOpc = Offset.getOpcode(); const bool OffsetIsZExt = OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU; const bool OffsetIsSExt = OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU; // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible. if (!Extended && (OffsetIsSExt || OffsetIsZExt)) { SDValue ExtPg = Offset.getOperand(0); VTSDNode *ExtFrom = cast(Offset.getOperand(2).getNode()); EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType(); // If the predicate for the sign- or zero-extended offset is the // same as the predicate used for this load and the sign-/zero-extension // was from a 32-bits... 
if (ExtPg == Pg && ExtFromEVT == MVT::i32) { SDValue UnextendedOffset = Offset.getOperand(1); unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true); if (Signed) NewOpc = getSignExtendedGatherOpcode(NewOpc); return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other}, {Chain, Pg, Base, UnextendedOffset, Ty}); } } return SDValue(); } /// Optimize a vector shift instruction and its operand if shifted out /// bits are not used. static SDValue performVectorShiftCombine(SDNode *N, const AArch64TargetLowering &TLI, TargetLowering::DAGCombinerInfo &DCI) { assert(N->getOpcode() == AArch64ISD::VASHR || N->getOpcode() == AArch64ISD::VLSHR); SDValue Op = N->getOperand(0); unsigned OpScalarSize = Op.getScalarValueSizeInBits(); unsigned ShiftImm = N->getConstantOperandVal(1); assert(OpScalarSize > ShiftImm && "Invalid shift imm"); // Remove sign_extend_inreg (ashr(shl(x)) based on the number of sign bits. if (N->getOpcode() == AArch64ISD::VASHR && Op.getOpcode() == AArch64ISD::VSHL && N->getOperand(1) == Op.getOperand(1)) if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm) return Op.getOperand(0); APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm); APInt DemandedMask = ~ShiftedOutBits; if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI)) return SDValue(N, 0); return SDValue(); } static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) { // sunpklo(sext(pred)) -> sext(extract_low_half(pred)) // This transform works in partnership with performSetCCPunpkCombine to // remove unnecessary transfer of predicates into standard registers and back if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND && N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() == MVT::i1) { SDValue CC = N->getOperand(0)->getOperand(0); auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext()); SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC, DAG.getVectorIdxConstant(0, SDLoc(N))); return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk); } return SDValue(); } /// Target-specific DAG combine function for post-increment LD1 (lane) and /// post-increment LD1R. static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); if (!VT.is128BitVector() && !VT.is64BitVector()) return SDValue(); unsigned LoadIdx = IsLaneOp ? 1 : 0; SDNode *LD = N->getOperand(LoadIdx).getNode(); // If it is not LOAD, can not do such combine. if (LD->getOpcode() != ISD::LOAD) return SDValue(); // The vector lane must be a constant in the LD1LANE opcode. SDValue Lane; if (IsLaneOp) { Lane = N->getOperand(2); auto *LaneC = dyn_cast(Lane); if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); } LoadSDNode *LoadSDN = cast(LD); EVT MemVT = LoadSDN->getMemoryVT(); // Check if memory operand is the same type as the vector element. if (MemVT != VT.getVectorElementType()) return SDValue(); // Check if there are other uses. If so, do not combine as it will introduce // an extra load. for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; ++UI) { if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. continue; if (*UI != N) return SDValue(); } // If there is one use and it can splat the value, prefer that operation. // TODO: This could be expanded to more operations if they reliably use the // index variants. 
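  // FMUL and FMA have by-element forms (e.g. fmul v0.4s, v1.4s, v2.s[0]), so
  // when the only user is one of those it is usually better to keep the plain
  // load and let the lane be folded into the multiply than to form a
  // post-increment LD1R/LD1 (lane) here.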
if (N->hasOneUse()) { unsigned UseOpc = N->use_begin()->getOpcode(); if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA) return SDValue(); } SDValue Addr = LD->getOperand(1); SDValue Vector = N->getOperand(0); // Search for a use of the address operand that is an increment. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = Addr.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User->getOpcode() != ISD::ADD || UI.getUse().getResNo() != Addr.getResNo()) continue; // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { uint32_t IncVal = CInc->getZExtValue(); unsigned NumBytes = VT.getScalarSizeInBits() / 8; if (IncVal != NumBytes) continue; Inc = DAG.getRegister(AArch64::XZR, MVT::i64); } // To avoid cycle construction make sure that neither the load nor the add // are predecessors to each other or the Vector. SmallPtrSet Visited; SmallVector Worklist; Visited.insert(Addr.getNode()); Worklist.push_back(User); Worklist.push_back(LD); Worklist.push_back(Vector.getNode()); if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) || SDNode::hasPredecessorHelper(User, Visited, Worklist)) continue; SmallVector Ops; Ops.push_back(LD->getOperand(0)); // Chain if (IsLaneOp) { Ops.push_back(Vector); // The vector to be inserted Ops.push_back(Lane); // The lane to be inserted in the vector } Ops.push_back(Addr); Ops.push_back(Inc); EVT Tys[3] = { VT, MVT::i64, MVT::Other }; SDVTList SDTys = DAG.getVTList(Tys); unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, MemVT, LoadSDN->getMemOperand()); // Update the uses. SDValue NewResults[] = { SDValue(LD, 0), // The result of load SDValue(UpdN.getNode(), 2) // Chain }; DCI.CombineTo(LD, NewResults); DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register break; } return SDValue(); } /// Simplify ``Addr`` given that the top byte of it is ignored by HW during /// address translation. static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { APInt DemandedMask = APInt::getLowBitsSet(64, 56); KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) { DCI.CommitTargetLoweringOpt(TLO); return true; } return false; } static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) { assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) && "Expected STORE dag node in input!"); if (auto Store = dyn_cast(N)) { if (!Store->isTruncatingStore() || Store->isIndexed()) return SDValue(); SDValue Ext = Store->getValue(); auto ExtOpCode = Ext.getOpcode(); if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND && ExtOpCode != ISD::ANY_EXTEND) return SDValue(); SDValue Orig = Ext->getOperand(0); if (Store->getMemoryVT() != Orig.getValueType()) return SDValue(); return DAG.getStore(Store->getChain(), SDLoc(Store), Orig, Store->getBasePtr(), Store->getMemOperand()); } return SDValue(); } // Perform TBI simplification if supported by the target and try to break up // nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit // load instructions can be selected. 
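// For example (roughly), a nontemporal v20i32 (640-bit) load can be emitted
// as two 256-bit loads (which can use the LDNP Q-form) plus a v4i32 load for
// the remaining lanes, with the pieces concatenated and the original type
// extracted from the result.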
static SDValue performLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { if (Subtarget->supportsAddressTopByteIgnored()) performTBISimplification(N->getOperand(1), DCI, DAG); LoadSDNode *LD = cast(N); EVT MemVT = LD->getMemoryVT(); if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian()) return SDValue(N, 0); if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 || MemVT.getSizeInBits() % 256 == 0 || 256 % MemVT.getScalarSizeInBits() != 0) return SDValue(N, 0); SDLoc DL(LD); SDValue Chain = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); SDNodeFlags Flags = LD->getFlags(); SmallVector LoadOps; SmallVector LoadOpsChain; // Replace any non temporal load over 256-bit with a series of 256 bit loads // and a scalar/vector load less than 256. This way we can utilize 256-bit // loads and reduce the amount of load instructions generated. MVT NewVT = MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(), 256 / MemVT.getVectorElementType().getSizeInBits()); unsigned Num256Loads = MemVT.getSizeInBits() / 256; // Create all 256-bit loads starting from offset 0 and up to Num256Loads-1*32. for (unsigned I = 0; I < Num256Loads; I++) { unsigned PtrOffset = I * 32; SDValue NewPtr = DAG.getMemBasePlusOffset( BasePtr, TypeSize::Fixed(PtrOffset), DL, Flags); Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset); SDValue NewLoad = DAG.getLoad( NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo()); LoadOps.push_back(NewLoad); LoadOpsChain.push_back(SDValue(cast(NewLoad), 1)); } // Process remaining bits of the load operation. // This is done by creating an UNDEF vector to match the size of the // 256-bit loads and inserting the remaining load to it. We extract the // original load type at the end using EXTRACT_SUBVECTOR instruction. unsigned BitsRemaining = MemVT.getSizeInBits() % 256; unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8; MVT RemainingVT = MVT::getVectorVT( MemVT.getVectorElementType().getSimpleVT(), BitsRemaining / MemVT.getVectorElementType().getSizeInBits()); SDValue NewPtr = DAG.getMemBasePlusOffset(BasePtr, TypeSize::Fixed(PtrOffset), DL, Flags); Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset); SDValue RemainingLoad = DAG.getLoad(RemainingVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo()); SDValue UndefVector = DAG.getUNDEF(NewVT); SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL); SDValue ExtendedReminingLoad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT, {UndefVector, RemainingLoad, InsertIdx}); LoadOps.push_back(ExtendedReminingLoad); LoadOpsChain.push_back(SDValue(cast(RemainingLoad), 1)); EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), LoadOps.size() * NewVT.getVectorNumElements()); SDValue ConcatVectors = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps); // Extract the original vector type size. 
SDValue ExtractSubVector =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
                  {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
  SDValue TokenFactor =
      DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
  return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
}

static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
  EVT VecVT = Op.getValueType();
  assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
         "Need boolean vector type.");

  if (Depth > 3)
    return MVT::INVALID_SIMPLE_VALUE_TYPE;

  // We can get the base type from a vector compare or truncate.
  if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
    return Op.getOperand(0).getValueType();

  // If an operand is a bool vector, continue looking.
  EVT BaseVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
  for (SDValue Operand : Op->op_values()) {
    if (Operand.getValueType() != VecVT)
      continue;

    EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
    if (!BaseVT.isSimple())
      BaseVT = OperandVT;
    else if (OperandVT != BaseVT)
      return MVT::INVALID_SIMPLE_VALUE_TYPE;
  }
  return BaseVT;
}

// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
// iN, we can use a trick that extracts the i^th bit from the i^th element and
// then performs a vector add to get a scalar bitmask. This requires that each
// element's bits are either all 1 or all 0.
static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue ComparisonResult(N, 0);
  EVT VecVT = ComparisonResult.getValueType();
  assert(VecVT.isVector() && "Must be a vector type");

  unsigned NumElts = VecVT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return SDValue();

  if (VecVT.getVectorElementType() != MVT::i1 &&
      !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
    return SDValue();

  // If we can find the original types to work on instead of a vector of i1,
  // we can avoid extend/extract conversion instructions.
  if (VecVT.getVectorElementType() == MVT::i1) {
    VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
    if (!VecVT.isSimple()) {
      unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
      VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
    }
  }
  VecVT = VecVT.changeVectorElementTypeToInteger();

  // Large vectors don't map directly to this conversion, so to avoid too many
  // edge cases, we don't apply it here. The conversion will likely still be
  // applied later via multiple smaller vectors, whose results are concatenated.
  if (VecVT.getSizeInBits() > 128)
    return SDValue();

  // Ensure that all elements' bits are either 0s or 1s.
  ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);

  SmallVector<SDValue, 16> MaskConstants;
  if (VecVT == MVT::v16i8) {
    // v16i8 is a special case, as we need to split it into two halves and
    // combine, perform the mask+addition twice, and then combine them.
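    // Roughly: each 8-lane half is ANDed with the mask <1,2,4,...,128>, each
    // half is then summed into an i16 bitmask with a vector add reduction, and
    // the high half's bitmask is shifted left by 8 and ORed with the low
    // half's bitmask to form the final 16-bit result.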
for (unsigned Half = 0; Half < 2; ++Half) { for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) { MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32)); } } SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants); SDValue RepresentativeBits = DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask); EVT HalfVT = VecVT.getHalfNumVectorElementsVT(*DAG.getContext()); unsigned NumElementsInHalf = HalfVT.getVectorNumElements(); SDValue LowHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, RepresentativeBits, DAG.getConstant(0, DL, MVT::i64)); SDValue HighHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, RepresentativeBits, DAG.getConstant(NumElementsInHalf, DL, MVT::i64)); SDValue ReducedLowBits = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, LowHalf); SDValue ReducedHighBits = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, HighHalf); SDValue ShiftedHighBits = DAG.getNode(ISD::SHL, DL, MVT::i16, ReducedHighBits, DAG.getConstant(NumElementsInHalf, DL, MVT::i32)); return DAG.getNode(ISD::OR, DL, MVT::i16, ShiftedHighBits, ReducedLowBits); } // All other vector sizes. unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1); for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) { MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64)); } SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants); SDValue RepresentativeBits = DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask); EVT ResultVT = MVT::getIntegerVT(std::max( NumElts, VecVT.getVectorElementType().getSizeInBits())); return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits); } static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, StoreSDNode *Store) { if (!Store->isTruncatingStore()) return SDValue(); SDLoc DL(Store); SDValue VecOp = Store->getValue(); EVT VT = VecOp.getValueType(); EVT MemVT = Store->getMemoryVT(); if (!MemVT.isVector() || !VT.isVector() || MemVT.getVectorElementType() != MVT::i1) return SDValue(); // If we are storing a vector that we are currently building, let // `scalarizeVectorStore()` handle this more efficiently. if (VecOp.getOpcode() == ISD::BUILD_VECTOR) return SDValue(); VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp); SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG); if (!VectorBits) return SDValue(); EVT StoreVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getStoreSizeInBits()); SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT); return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(), Store->getMemOperand()); } static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { StoreSDNode *ST = cast(N); SDValue Chain = ST->getChain(); SDValue Value = ST->getValue(); SDValue Ptr = ST->getBasePtr(); EVT ValueVT = Value.getValueType(); auto hasValidElementTypeForFPTruncStore = [](EVT VT) { EVT EltVT = VT.getVectorElementType(); return EltVT == MVT::f32 || EltVT == MVT::f64; }; // If this is an FP_ROUND followed by a store, fold this into a truncating // store. We can do this even if this is already a truncstore. // We purposefully don't care about legality of the nodes here as we know // they can be split down into something legal. 
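  // I.e. (roughly):
  //   store (fp_round x), ptr
  // becomes
  //   truncstore x, ptr   // truncating to the original store's memory type
  // when SVE is used for fixed-length vectors and the value is wide enough
  // for the predicated truncating store to pay off.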
if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND && Value.getNode()->hasOneUse() && ST->isUnindexed() && Subtarget->useSVEForFixedLengthVectors() && ValueVT.isFixedLengthVector() && ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() && hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType())) return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, ST->getMemoryVT(), ST->getMemOperand()); if (SDValue Split = splitStores(N, DCI, DAG, Subtarget)) return Split; if (Subtarget->supportsAddressTopByteIgnored() && performTBISimplification(N->getOperand(2), DCI, DAG)) return SDValue(N, 0); if (SDValue Store = foldTruncStoreOfExt(DAG, N)) return Store; if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST)) return Store; return SDValue(); } static SDValue performMSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { MaskedStoreSDNode *MST = cast(N); SDValue Value = MST->getValue(); SDValue Mask = MST->getMask(); SDLoc DL(N); // If this is a UZP1 followed by a masked store, fold this into a masked // truncating store. We can do this even if this is already a masked // truncstore. if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() && MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE && Value.getValueType().isInteger()) { Value = Value.getOperand(0); if (Value.getOpcode() == ISD::BITCAST) { EVT HalfVT = Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext()); EVT InVT = Value.getOperand(0).getValueType(); if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) { unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits(); unsigned PgPattern = Mask->getConstantOperandVal(0); // Ensure we can double the size of the predicate pattern unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern); if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <= MinSVESize) { Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1), PgPattern); return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0), MST->getBasePtr(), MST->getOffset(), Mask, MST->getMemoryVT(), MST->getMemOperand(), MST->getAddressingMode(), /*IsTruncating=*/true); } } } } return SDValue(); } /// \return true if part of the index was folded into the Base. static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, SDLoc DL, SelectionDAG &DAG) { // This function assumes a vector of i64 indices. EVT IndexVT = Index.getValueType(); if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64) return false; // Simplify: // BasePtr = Ptr // Index = X + splat(Offset) // -> // BasePtr = Ptr + Offset * scale. 
  //   Index = X
  if (Index.getOpcode() == ISD::ADD) {
    if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
      Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
      BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
      Index = Index.getOperand(0);
      return true;
    }
  }

  // Simplify:
  //   BasePtr = Ptr
  //   Index = (X + splat(Offset)) << splat(Shift)
  // ->
  //   BasePtr = Ptr + (Offset << Shift) * scale
  //   Index = X << splat(shift)
  if (Index.getOpcode() == ISD::SHL &&
      Index.getOperand(0).getOpcode() == ISD::ADD) {
    SDValue Add = Index.getOperand(0);
    SDValue ShiftOp = Index.getOperand(1);
    SDValue OffsetOp = Add.getOperand(1);
    if (auto Shift = DAG.getSplatValue(ShiftOp))
      if (auto Offset = DAG.getSplatValue(OffsetOp)) {
        Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
        Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
        BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
        Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
                            Add.getOperand(0), ShiftOp);
        return true;
      }
  }

  return false;
}

// Analyse the specified address returning true if a more optimal addressing
// mode is available. When returning true all parameters are updated to reflect
// their recommended values.
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
                                     SDValue &BasePtr, SDValue &Index,
                                     SelectionDAG &DAG) {
  // Try to iteratively fold parts of the index into the base pointer to
  // simplify the index as much as possible.
  bool Changed = false;
  while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
    Changed = true;

  // Only consider element types that are pointer sized as smaller types can
  // be easily promoted.
  EVT IndexVT = Index.getValueType();
  if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
    return Changed;

  // Can indices be trivially shrunk?
  EVT DataVT = N->getOperand(1).getValueType();
  // Don't attempt to shrink the index for fixed vectors of 64 bit data since
  // it will later be re-extended to 64 bits in legalization.
  if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
    return Changed;
  if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
    EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
    Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
    return true;
  }

  // Match:
  //   Index = step(const)
  int64_t Stride = 0;
  if (Index.getOpcode() == ISD::STEP_VECTOR) {
    Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
  }
  // Match:
  //   Index = step(const) << shift(const)
  else if (Index.getOpcode() == ISD::SHL &&
           Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
    SDValue RHS = Index.getOperand(1);
    if (auto *Shift =
            dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
      int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
      Stride = Step << Shift->getZExtValue();
    }
  }

  // Return early because no supported pattern is found.
  if (Stride == 0)
    return Changed;

  if (Stride < std::numeric_limits<int32_t>::min() ||
      Stride > std::numeric_limits<int32_t>::max())
    return Changed;

  const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
  unsigned MaxVScale =
      Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
  int64_t LastElementOffset =
      IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
  if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
      LastElementOffset > std::numeric_limits<int32_t>::max())
    return Changed;

  EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
  // Stride does not scale explicitly by 'Scale', because it happens in
  // the gather/scatter addressing mode.
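  // Worked example (illustrative, not part of the original comment): for an
  // nxv4i64 index matching step(8) and a 2048-bit maximum SVE vector length,
  // MaxVScale is 2048 / 128 = 16 and LastElementOffset is 4 * 8 * 16 = 512,
  // which fits in an i32, so the step vector below can be rebuilt with 32-bit
  // elements and legalized more cheaply than the original 64-bit index.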
Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride)); return true; } static SDValue performMaskedGatherScatterCombine( SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { MaskedGatherScatterSDNode *MGS = cast(N); assert(MGS && "Can only combine gather load or scatter store nodes"); if (!DCI.isBeforeLegalize()) return SDValue(); SDLoc DL(MGS); SDValue Chain = MGS->getChain(); SDValue Scale = MGS->getScale(); SDValue Index = MGS->getIndex(); SDValue Mask = MGS->getMask(); SDValue BasePtr = MGS->getBasePtr(); ISD::MemIndexType IndexType = MGS->getIndexType(); if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG)) return SDValue(); // Here we catch such cases early and change MGATHER's IndexType to allow // the use of an Index that's more legalisation friendly. if (auto *MGT = dyn_cast(MGS)) { SDValue PassThru = MGT->getPassThru(); SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; return DAG.getMaskedGather( DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL, Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType()); } auto *MSC = cast(MGS); SDValue Data = MSC->getValue(); SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale}; return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL, Ops, MSC->getMemOperand(), IndexType, MSC->isTruncatingStore()); } /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); unsigned AddrOpIdx = N->getNumOperands() - 1; SDValue Addr = N->getOperand(AddrOpIdx); // Search for a use of the address operand that is an increment. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = Addr.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User->getOpcode() != ISD::ADD || UI.getUse().getResNo() != Addr.getResNo()) continue; // Check that the add is independent of the load/store. Otherwise, folding // it would create a cycle. SmallPtrSet Visited; SmallVector Worklist; Visited.insert(Addr.getNode()); Worklist.push_back(N); Worklist.push_back(User); if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || SDNode::hasPredecessorHelper(User, Visited, Worklist)) continue; // Find the new opcode for the updating load/store. 
bool IsStore = false; bool IsLaneOp = false; bool IsDupOp = false; unsigned NewOpc = 0; unsigned NumVecs = 0; unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); switch (IntNo) { default: llvm_unreachable("unexpected intrinsic for Neon base update"); case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; NumVecs = 2; break; case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; NumVecs = 3; break; case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; NumVecs = 4; break; case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; NumVecs = 2; IsStore = true; break; case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; NumVecs = 3; IsStore = true; break; case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; NumVecs = 4; IsStore = true; break; case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; NumVecs = 2; break; case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; NumVecs = 3; break; case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; NumVecs = 4; break; case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; NumVecs = 2; IsStore = true; break; case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; NumVecs = 3; IsStore = true; break; case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; NumVecs = 4; IsStore = true; break; case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; NumVecs = 2; IsDupOp = true; break; case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; NumVecs = 3; IsDupOp = true; break; case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; NumVecs = 4; IsDupOp = true; break; case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; NumVecs = 2; IsLaneOp = true; break; case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; NumVecs = 3; IsLaneOp = true; break; case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; NumVecs = 4; IsLaneOp = true; break; case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; NumVecs = 2; IsStore = true; IsLaneOp = true; break; case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; NumVecs = 3; IsStore = true; IsLaneOp = true; break; case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; NumVecs = 4; IsStore = true; IsLaneOp = true; break; } EVT VecTy; if (IsStore) VecTy = N->getOperand(2).getValueType(); else VecTy = N->getValueType(0); // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { uint32_t IncVal = CInc->getZExtValue(); unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; if (IsLaneOp || IsDupOp) NumBytes /= VecTy.getVectorNumElements(); if (IncVal != NumBytes) continue; Inc = DAG.getRegister(AArch64::XZR, MVT::i64); } SmallVector Ops; Ops.push_back(N->getOperand(0)); // Incoming chain // Load lane and store have vector list as input. if (IsLaneOp || IsStore) for (unsigned i = 2; i < AddrOpIdx; ++i) Ops.push_back(N->getOperand(i)); Ops.push_back(Addr); // Base register Ops.push_back(Inc); // Return Types. EVT Tys[6]; unsigned NumResultVecs = (IsStore ? 
0 : NumVecs); unsigned n; for (n = 0; n < NumResultVecs; ++n) Tys[n] = VecTy; Tys[n++] = MVT::i64; // Type of write back register Tys[n] = MVT::Other; // Type of the chain SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2)); MemIntrinsicSDNode *MemInt = cast(N); SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, MemInt->getMemoryVT(), MemInt->getMemOperand()); // Update the uses. std::vector NewResults; for (unsigned i = 0; i < NumResultVecs; ++i) { NewResults.push_back(SDValue(UpdN.getNode(), i)); } NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); DCI.CombineTo(N, NewResults); DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); break; } return SDValue(); } // Checks to see if the value is the prescribed width and returns information // about its extension mode. static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { ExtType = ISD::NON_EXTLOAD; switch(V.getNode()->getOpcode()) { default: return false; case ISD::LOAD: { LoadSDNode *LoadNode = cast(V.getNode()); if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { ExtType = LoadNode->getExtensionType(); return true; } return false; } case ISD::AssertSext: { VTSDNode *TypeNode = cast(V.getNode()->getOperand(1)); if ((TypeNode->getVT() == MVT::i8 && width == 8) || (TypeNode->getVT() == MVT::i16 && width == 16)) { ExtType = ISD::SEXTLOAD; return true; } return false; } case ISD::AssertZext: { VTSDNode *TypeNode = cast(V.getNode()->getOperand(1)); if ((TypeNode->getVT() == MVT::i8 && width == 8) || (TypeNode->getVT() == MVT::i16 && width == 16)) { ExtType = ISD::ZEXTLOAD; return true; } return false; } case ISD::Constant: case ISD::TargetConstant: { return std::abs(cast(V.getNode())->getSExtValue()) < 1LL << (width - 1); } } return true; } // This function does a whole lot of voodoo to determine if the tests are // equivalent without and with a mask. Essentially what happens is that given a // DAG resembling: // // +-------------+ +-------------+ +-------------+ +-------------+ // | Input | | AddConstant | | CompConstant| | CC | // +-------------+ +-------------+ +-------------+ +-------------+ // | | | | // V V | +----------+ // +-------------+ +----+ | | // | ADD | |0xff| | | // +-------------+ +----+ | | // | | | | // V V | | // +-------------+ | | // | AND | | | // +-------------+ | | // | | | // +-----+ | | // | | | // V V V // +-------------+ // | CMP | // +-------------+ // // The AND node may be safely removed for some combinations of inputs. In // particular we need to take into account the extension type of the Input, // the exact values of AddConstant, CompConstant, and CC, along with the nominal // width of the input (this can work for any width inputs, the above graph is // specific to 8 bits. // // The specific equations were worked out by generating output tables for each // AArch64CC value in terms of and AddConstant (w1), CompConstant(w2). The // problem was simplified by working with 4 bit inputs, which means we only // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8 // patterns present in both extensions (0,7). For every distinct set of // AddConstant and CompConstants bit patterns we can consider the masked and // unmasked versions to be equivalent if the result of this function is true for // all 16 distinct bit patterns of for the current extension type of Input (w0). 
//
//   sub     w8, w0, w1
//   and     w10, w8, #0x0f
//   cmp     w8, w2
//   cset    w9, AArch64CC
//   cmp     w10, w2
//   cset    w11, AArch64CC
//   cmp     w9, w11
//   cset    w0, eq
//   ret
//
// Since the above function shows when the outputs are equivalent it defines
// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
// would be expensive to run during compilation. The equations below were
// written in a test harness that confirmed they gave outputs equivalent to the
// above function for all inputs, so they can be used to determine whether the
// removal is legal instead.
//
// isEquivalentMaskless() is the code for testing if the AND can be removed,
// factored out of the DAG recognition as the DAG can take several forms.
static bool isEquivalentMaskless(unsigned CC, unsigned width,
                                 ISD::LoadExtType ExtType, int AddConstant,
                                 int CompConstant) {
  // By being careful about our equations and only writing them in terms of
  // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
  // make them generally applicable to all bit widths.
  int MaxUInt = (1 << width);

  // For the purposes of these comparisons sign extending the type is
  // equivalent to zero extending the add and displacing it by half the integer
  // width. Provided we are careful and make sure our equations are valid over
  // the whole range we can just adjust the input and avoid writing equations
  // for sign extended inputs.
  if (ExtType == ISD::SEXTLOAD)
    AddConstant -= (1 << (width - 1));

  switch (CC) {
  case AArch64CC::LE:
  case AArch64CC::GT:
    if ((AddConstant == 0) ||
        (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
        (AddConstant >= 0 && CompConstant < 0) ||
        (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
      return true;
    break;
  case AArch64CC::LT:
  case AArch64CC::GE:
    if ((AddConstant == 0) || (AddConstant >= 0 && CompConstant <= 0) ||
        (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
      return true;
    break;
  case AArch64CC::HI:
  case AArch64CC::LS:
    if ((AddConstant >= 0 && CompConstant < 0) ||
        (AddConstant <= 0 && CompConstant >= -1 &&
         CompConstant < AddConstant + MaxUInt))
      return true;
    break;
  case AArch64CC::PL:
  case AArch64CC::MI:
    if ((AddConstant == 0) || (AddConstant > 0 && CompConstant <= 0) ||
        (AddConstant < 0 && CompConstant <= AddConstant))
      return true;
    break;
  case AArch64CC::LO:
  case AArch64CC::HS:
    if ((AddConstant >= 0 && CompConstant <= 0) ||
        (AddConstant <= 0 && CompConstant >= 0 &&
         CompConstant <= AddConstant + MaxUInt))
      return true;
    break;
  case AArch64CC::EQ:
  case AArch64CC::NE:
    if ((AddConstant > 0 && CompConstant < 0) ||
        (AddConstant < 0 && CompConstant >= 0 &&
         CompConstant < AddConstant + MaxUInt) ||
        (AddConstant >= 0 && CompConstant >= 0 &&
         CompConstant >= AddConstant) ||
        (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
      return true;
    break;
  case AArch64CC::VS:
  case AArch64CC::VC:
  case AArch64CC::AL:
  case AArch64CC::NV:
    return true;
  case AArch64CC::Invalid:
    break;
  }

  return false;
}

// (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
// (X & C) <u Pow2 --> ((X & (C & ~(Pow2 - 1))) == 0)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
                                        SDNode *AndNode, SelectionDAG &DAG,
                                        unsigned CCIndex, unsigned CmpIndex,
                                        unsigned CC) {
  ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
  if (!SubsC)
    return SDValue();

  APInt SubsAP = SubsC->getAPIntValue();
  if (CC == AArch64CC::HI) {
    if (!SubsAP.isMask())
      return SDValue();
  } else if (CC == AArch64CC::LO) {
    if (!SubsAP.isPowerOf2())
      return SDValue();
  } else
    return SDValue();

  ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
  if
(!AndC) return SDValue(); APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1); SDLoc DL(N); APInt AndSMask = (~MaskAP) & AndC->getAPIntValue(); SDValue ANDS = DAG.getNode( AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0), DAG.getConstant(AndSMask, DL, SubsC->getValueType(0))); SDValue AArch64_CC = DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL, N->getOperand(CCIndex)->getValueType(0)); // For now, only performCSELCombine and performBRCONDCombine call this // function. And both of them pass 2 for CCIndex, 3 for CmpIndex with 4 // operands. So just init the ops direct to simplify the code. If we have some // other case with different CCIndex, CmpIndex, we need to use for loop to // rewrite the code here. // TODO: Do we need to assert number of operand is 4 here? assert((CCIndex == 2 && CmpIndex == 3) && "Expected CCIndex to be 2 and CmpIndex to be 3."); SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC, ANDS.getValue(1)}; return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops); } static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex) { unsigned CC = cast(N->getOperand(CCIndex))->getSExtValue(); SDNode *SubsNode = N->getOperand(CmpIndex).getNode(); unsigned CondOpcode = SubsNode->getOpcode(); if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0)) return SDValue(); // There is a SUBS feeding this condition. Is it fed by a mask we can // use? SDNode *AndNode = SubsNode->getOperand(0).getNode(); unsigned MaskBits = 0; if (AndNode->getOpcode() != ISD::AND) return SDValue(); if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex, CmpIndex, CC)) return Val; if (ConstantSDNode *CN = dyn_cast(AndNode->getOperand(1))) { uint32_t CNV = CN->getZExtValue(); if (CNV == 255) MaskBits = 8; else if (CNV == 65535) MaskBits = 16; } if (!MaskBits) return SDValue(); SDValue AddValue = AndNode->getOperand(0); if (AddValue.getOpcode() != ISD::ADD) return SDValue(); // The basic dag structure is correct, grab the inputs and validate them. SDValue AddInputValue1 = AddValue.getNode()->getOperand(0); SDValue AddInputValue2 = AddValue.getNode()->getOperand(1); SDValue SubsInputValue = SubsNode->getOperand(1); // The mask is present and the provenance of all the values is a smaller type, // lets see if the mask is superfluous. if (!isa(AddInputValue2.getNode()) || !isa(SubsInputValue.getNode())) return SDValue(); ISD::LoadExtType ExtType; if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) || !checkValueWidth(AddInputValue2, MaskBits, ExtType) || !checkValueWidth(AddInputValue1, MaskBits, ExtType) ) return SDValue(); if(!isEquivalentMaskless(CC, MaskBits, ExtType, cast(AddInputValue2.getNode())->getSExtValue(), cast(SubsInputValue.getNode())->getSExtValue())) return SDValue(); // The AND is not necessary, remove it. SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0), SubsNode->getValueType(1)); SDValue Ops[] = { AddValue, SubsNode->getOperand(1) }; SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops); DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode()); return SDValue(N, 0); } // Optimize compare with zero and branch. 
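// Illustrative sketch only (not used by the compiler): a minimal version of
// the brute-force harness described above isEquivalentMaskless(). "Cmp" is a
// hypothetical stand-in for evaluating one AArch64CC condition on two scalar
// values; it does not model the real NZCV flag logic. The helper checks, for
// all 16 zero-extended 4-bit inputs, that masking the sum with 0x0f never
// changes the comparison outcome.
static bool masklessEquivalentForAllInputsSketch(bool (*Cmp)(int, int),
                                                 int AddConstant,
                                                 int CompConstant) {
  for (int Input = 0; Input < 16; ++Input) { // every 4-bit pattern
    int Sum = Input + AddConstant;
    bool Unmasked = Cmp(Sum, CompConstant);
    bool Masked = Cmp(Sum & 0x0f, CompConstant);
    if (Unmasked != Masked)
      return false; // removing the AND would change this input's result
  }
  return true;
}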
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions // will not be produced, as they are conditional branch instructions that do // not set flags. if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening)) return SDValue(); if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3)) N = NV.getNode(); SDValue Chain = N->getOperand(0); SDValue Dest = N->getOperand(1); SDValue CCVal = N->getOperand(2); SDValue Cmp = N->getOperand(3); assert(isa(CCVal) && "Expected a ConstantSDNode here!"); unsigned CC = cast(CCVal)->getZExtValue(); if (CC != AArch64CC::EQ && CC != AArch64CC::NE) return SDValue(); unsigned CmpOpc = Cmp.getOpcode(); if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) return SDValue(); // Only attempt folding if there is only one use of the flag and no use of the // value. if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) return SDValue(); SDValue LHS = Cmp.getOperand(0); SDValue RHS = Cmp.getOperand(1); assert(LHS.getValueType() == RHS.getValueType() && "Expected the value type to be the same for both operands!"); if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) return SDValue(); if (isNullConstant(LHS)) std::swap(LHS, RHS); if (!isNullConstant(RHS)) return SDValue(); if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || LHS.getOpcode() == ISD::SRL) return SDValue(); // Fold the compare into the branch instruction. SDValue BR; if (CC == AArch64CC::EQ) BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); else BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, BR, false); return SDValue(); } static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) { unsigned CC = N->getConstantOperandVal(2); SDValue SUBS = N->getOperand(3); SDValue Zero, CTTZ; if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) { Zero = N->getOperand(0); CTTZ = N->getOperand(1); } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) { Zero = N->getOperand(1); CTTZ = N->getOperand(0); } else return SDValue(); if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) || (CTTZ.getOpcode() == ISD::TRUNCATE && CTTZ.getOperand(0).getOpcode() != ISD::CTTZ)) return SDValue(); assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) && "Illegal type in CTTZ folding"); if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1))) return SDValue(); SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE ? CTTZ.getOperand(0).getOperand(0) : CTTZ.getOperand(0); if (X != SUBS.getOperand(0)) return SDValue(); unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE ? 
CTTZ.getOperand(0).getValueSizeInBits() : CTTZ.getValueSizeInBits(); SDValue BitWidthMinusOne = DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType()); return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ, BitWidthMinusOne); } // (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond) // (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond) // Where x and y are constants and x != y // (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond) // (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond) // Where x and y are constants and x != y static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) { SDValue L = Op->getOperand(0); SDValue R = Op->getOperand(1); AArch64CC::CondCode OpCC = static_cast(Op->getConstantOperandVal(2)); SDValue OpCmp = Op->getOperand(3); if (!isCMP(OpCmp)) return SDValue(); SDValue CmpLHS = OpCmp.getOperand(0); SDValue CmpRHS = OpCmp.getOperand(1); if (CmpRHS.getOpcode() == AArch64ISD::CSEL) std::swap(CmpLHS, CmpRHS); else if (CmpLHS.getOpcode() != AArch64ISD::CSEL) return SDValue(); SDValue X = CmpLHS->getOperand(0); SDValue Y = CmpLHS->getOperand(1); if (!isa(X) || !isa(Y) || X == Y) { return SDValue(); } // If one of the constant is opaque constant, x,y sdnode is still different // but the real value maybe the same. So check APInt here to make sure the // code is correct. ConstantSDNode *CX = cast(X); ConstantSDNode *CY = cast(Y); if (CX->getAPIntValue() == CY->getAPIntValue()) return SDValue(); AArch64CC::CondCode CC = static_cast(CmpLHS->getConstantOperandVal(2)); SDValue Cond = CmpLHS->getOperand(3); if (CmpRHS == Y) CC = AArch64CC::getInvertedCondCode(CC); else if (CmpRHS != X) return SDValue(); if (OpCC == AArch64CC::NE) CC = AArch64CC::getInvertedCondCode(CC); else if (OpCC != AArch64CC::EQ) return SDValue(); SDLoc DL(Op); EVT VT = Op->getValueType(0); SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond); } // Optimize CSEL instructions static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // CSEL x, x, cc -> x if (N->getOperand(0) == N->getOperand(1)) return N->getOperand(0); if (SDValue R = foldCSELOfCSEL(N, DAG)) return R; // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1 if (SDValue Folded = foldCSELofCTTZ(N, DAG)) return Folded; return performCONDCombine(N, DCI, DAG, 2, 3); } // Try to re-use an already extended operand of a vector SetCC feeding a // extended select. Doing so avoids requiring another full extension of the // SET_CC result when lowering the select. static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) { EVT Op0MVT = Op->getOperand(0).getValueType(); if (!Op0MVT.isVector() || Op->use_empty()) return SDValue(); // Make sure that all uses of Op are VSELECTs with result matching types where // the result type has a larger element type than the SetCC operand. 
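  // Illustrative example (not from the original comment): a v4i32 setcc
  // against a splatted constant whose only uses are v4i64 vselects qualifies
  // here; if a sign or zero extension of its first operand to v4i64 already
  // exists, the setcc is rebuilt on the extended operands so lowering the
  // select does not need another full extension of the setcc result.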
SDNode *FirstUse = *Op->use_begin(); if (FirstUse->getOpcode() != ISD::VSELECT) return SDValue(); EVT UseMVT = FirstUse->getValueType(0); if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits()) return SDValue(); if (any_of(Op->uses(), [&UseMVT](const SDNode *N) { return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT; })) return SDValue(); APInt V; if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V)) return SDValue(); SDLoc DL(Op); SDValue Op0ExtV; SDValue Op1ExtV; ISD::CondCode CC = cast(Op->getOperand(2))->get(); // Check if the first operand of the SET_CC is already extended. If it is, // split the SET_CC and re-use the extended version of the operand. SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT), Op->getOperand(0)); SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT), Op->getOperand(0)); if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) { Op0ExtV = SDValue(Op0SExt, 0); Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1)); } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) { Op0ExtV = SDValue(Op0ZExt, 0); Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1)); } else return SDValue(); return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1), Op0ExtV, Op1ExtV, Op->getOperand(2)); } static SDValue performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { SDValue Vec = N->getOperand(0); if (DCI.isBeforeLegalize() && Vec.getValueType().getVectorElementType() == MVT::i1 && Vec.getValueType().isFixedLengthVector() && Vec.getValueType().isPow2VectorType()) { SDLoc DL(N); return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL, DAG); } return SDValue(); } static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!"); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); ISD::CondCode Cond = cast(N->getOperand(2))->get(); SDLoc DL(N); EVT VT = N->getValueType(0); if (SDValue V = tryToWidenSetCCOperands(N, DAG)) return V; // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X if (Cond == ISD::SETNE && isOneConstant(RHS) && LHS->getOpcode() == AArch64ISD::CSEL && isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) && LHS->hasOneUse()) { // Invert CSEL's condition. 
auto *OpCC = cast(LHS.getOperand(2)); auto OldCond = static_cast(OpCC->getZExtValue()); auto NewCond = getInvertedCondCode(OldCond); // csel 0, 1, !cond, X SDValue CSEL = DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0), LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32), LHS.getOperand(3)); return DAG.getZExtOrTrunc(CSEL, DL, VT); } // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne if (Cond == ISD::SETNE && isNullConstant(RHS) && LHS->getOpcode() == ISD::SRL && isa(LHS->getOperand(1)) && LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() && LHS->hasOneUse()) { EVT TstVT = LHS->getValueType(0); if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) { // this pattern will get better opt in emitComparison uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1); SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0), DAG.getConstant(TstImm, DL, TstVT)); return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2)); } } // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne) // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne) // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne) // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne) if (DCI.isBeforeLegalize() && VT.isScalarInteger() && (Cond == ISD::SETEQ || Cond == ISD::SETNE) && (isNullConstant(RHS) || isAllOnesConstant(RHS)) && LHS->getOpcode() == ISD::BITCAST) { EVT ToVT = LHS->getValueType(0); EVT FromVT = LHS->getOperand(0).getValueType(); if (FromVT.isFixedLengthVector() && FromVT.getVectorElementType() == MVT::i1) { bool IsNull = isNullConstant(RHS); LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND, DL, MVT::i1, LHS->getOperand(0)); LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT, LHS); return DAG.getSetCC(DL, VT, LHS, RHS, Cond); } } // Try to perform the memcmp when the result is tested for [in]equality with 0 if (SDValue V = performOrXorChainCombine(N, DAG)) return V; return SDValue(); } // Replace a flag-setting operator (eg ANDS) with the generic version // (eg AND) if the flag is unused. static SDValue performFlagSettingCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned GenericOpcode) { SDLoc DL(N); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); EVT VT = N->getValueType(0); // If the flag result isn't used, convert back to a generic opcode. if (!N->hasAnyUseOfValue(1)) { SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops()); return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)}, DL); } // Combine identical generic nodes into this node, re-using the result. if (SDNode *Generic = DCI.DAG.getNodeIfExists( GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS})) DCI.CombineTo(Generic, SDValue(N, 0)); return SDValue(); } static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) { // setcc_merge_zero pred // (sign_extend (extract_subvector (setcc_merge_zero ... 
pred ...))), 0, ne // => extract_subvector (inner setcc_merge_zero) SDValue Pred = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); ISD::CondCode Cond = cast(N->getOperand(3))->get(); if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) || LHS->getOpcode() != ISD::SIGN_EXTEND) return SDValue(); SDValue Extract = LHS->getOperand(0); if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR || Extract->getValueType(0) != N->getValueType(0) || Extract->getConstantOperandVal(1) != 0) return SDValue(); SDValue InnerSetCC = Extract->getOperand(0); if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO) return SDValue(); // By this point we've effectively got // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive // lanes are already zero then the trunc(sext()) sequence is redundant and we // can operate on A directly. SDValue InnerPred = InnerSetCC.getOperand(0); if (Pred.getOpcode() == AArch64ISD::PTRUE && InnerPred.getOpcode() == AArch64ISD::PTRUE && Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) && Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 && Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256) return Extract; return SDValue(); } static SDValue performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && "Unexpected opcode!"); SelectionDAG &DAG = DCI.DAG; SDValue Pred = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); ISD::CondCode Cond = cast(N->getOperand(3))->get(); if (SDValue V = performSetCCPunpkCombine(N, DAG)) return V; if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) && LHS->getOpcode() == ISD::SIGN_EXTEND && LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) { // setcc_merge_zero( // pred, extend(setcc_merge_zero(pred, ...)), != splat(0)) // => setcc_merge_zero(pred, ...) if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && LHS->getOperand(0)->getOperand(0) == Pred) return LHS->getOperand(0); // setcc_merge_zero( // all_active, extend(nxvNi1 ...), != splat(0)) // -> nxvNi1 ... if (isAllActivePredicate(DAG, Pred)) return LHS->getOperand(0); // setcc_merge_zero( // pred, extend(nxvNi1 ...), != splat(0)) // -> nxvNi1 and(pred, ...) if (DCI.isAfterLegalizeDAG()) // Do this after legalization to allow more folds on setcc_merge_zero // to be recognized. return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), LHS->getOperand(0), Pred); } return SDValue(); } // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test // as well as whether the test should be inverted. This code is required to // catch these cases (as opposed to standard dag combines) because // AArch64ISD::TBZ is matched during legalization. static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG) { if (!Op->hasOneUse()) return Op; // We don't handle undef/constant-fold cases below, as they should have // already been taken care of (e.g. and of 0, test of undefined shifted bits, // etc.) // (tbz (trunc x), b) -> (tbz x, b) // This case is just here to enable more of the below cases to be caught. if (Op->getOpcode() == ISD::TRUNCATE && Bit < Op->getValueType(0).getSizeInBits()) { return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits. 
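  // For example (illustrative, not from the original comment):
  //   (tbz (any_ext i32 %x to i64), 5) -> (tbz %x, 5)
  // because bit 5 lies within the original 32-bit value, so the extension
  // cannot affect the tested bit.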
if (Op->getOpcode() == ISD::ANY_EXTEND && Bit < Op->getOperand(0).getValueSizeInBits()) { return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } if (Op->getNumOperands() != 2) return Op; auto *C = dyn_cast(Op->getOperand(1)); if (!C) return Op; switch (Op->getOpcode()) { default: return Op; // (tbz (and x, m), b) -> (tbz x, b) case ISD::AND: if ((C->getZExtValue() >> Bit) & 1) return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); return Op; // (tbz (shl x, c), b) -> (tbz x, b-c) case ISD::SHL: if (C->getZExtValue() <= Bit && (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { Bit = Bit - C->getZExtValue(); return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } return Op; // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x case ISD::SRA: Bit = Bit + C->getZExtValue(); if (Bit >= Op->getValueType(0).getSizeInBits()) Bit = Op->getValueType(0).getSizeInBits() - 1; return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); // (tbz (srl x, c), b) -> (tbz x, b+c) case ISD::SRL: if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { Bit = Bit + C->getZExtValue(); return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } return Op; // (tbz (xor x, -1), b) -> (tbnz x, b) case ISD::XOR: if ((C->getZExtValue() >> Bit) & 1) Invert = !Invert; return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } } // Optimize test single bit zero/non-zero and branch. static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { unsigned Bit = cast(N->getOperand(2))->getZExtValue(); bool Invert = false; SDValue TestSrc = N->getOperand(1); SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG); if (TestSrc == NewTestSrc) return SDValue(); unsigned NewOpc = N->getOpcode(); if (Invert) { if (NewOpc == AArch64ISD::TBZ) NewOpc = AArch64ISD::TBNZ; else { assert(NewOpc == AArch64ISD::TBNZ); NewOpc = AArch64ISD::TBZ; } } SDLoc DL(N); return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc, DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); } // Swap vselect operands where it may allow a predicated operation to achieve // the `sel`. // // (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b))) // => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a)) static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) { auto SelectA = N->getOperand(1); auto SelectB = N->getOperand(2); auto NTy = N->getValueType(0); if (!NTy.isScalableVector()) return SDValue(); SDValue SetCC = N->getOperand(0); if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse()) return SDValue(); switch (SelectB.getOpcode()) { default: return SDValue(); case ISD::FMUL: case ISD::FSUB: case ISD::FADD: break; } if (SelectA != SelectB.getOperand(0)) return SDValue(); ISD::CondCode CC = cast(SetCC.getOperand(2))->get(); ISD::CondCode InverseCC = ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType()); auto InverseSetCC = DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0), SetCC.getOperand(1), InverseCC); return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy, {InverseSetCC, SelectB, SelectA}); } // vselect (v1i1 setcc) -> // vselect (v1iXX setcc) (XX is the size of the compared operand type) // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine // such VSELECT. 
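// Illustrative sketch only (not used by the lowering): the bit-index algebra
// behind the getTestBitOperand() rewrites above, checked on plain 64-bit
// scalars. Assuming C <= B < 64, bit B of (X << C) is bit (B - C) of X, which
// is the "(tbz (shl x, c), b) -> (tbz x, b-c)" rule; the helper name is
// hypothetical.
static bool shlTestBitRuleHoldsSketch(uint64_t X, unsigned C, unsigned B) {
  bool ViaShift = ((X << C) >> B) & 1; // test bit B after the shift
  bool Direct = (X >> (B - C)) & 1;    // test bit B - C of the original value
  return ViaShift == Direct;           // always true for C <= B < 64
}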
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { if (auto SwapResult = trySwapVSelectOperands(N, DAG)) return SwapResult; SDValue N0 = N->getOperand(0); EVT CCVT = N0.getValueType(); if (isAllActivePredicate(DAG, N0)) return N->getOperand(1); if (isAllInactivePredicate(N0)) return N->getOperand(2); // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform // into (OR (ASR lhs, N-1), 1), which requires less instructions for the // supported types. SDValue SetCC = N->getOperand(0); if (SetCC.getOpcode() == ISD::SETCC && SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) { SDValue CmpLHS = SetCC.getOperand(0); EVT VT = CmpLHS.getValueType(); SDNode *CmpRHS = SetCC.getOperand(1).getNode(); SDNode *SplatLHS = N->getOperand(1).getNode(); SDNode *SplatRHS = N->getOperand(2).getNode(); APInt SplatLHSVal; if (CmpLHS.getValueType() == N->getOperand(1).getValueType() && VT.isSimple() && is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, MVT::v4i32, MVT::v2i64}), VT.getSimpleVT().SimpleTy) && ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) && SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) && ISD::isConstantSplatVectorAllOnes(SplatRHS)) { unsigned NumElts = VT.getVectorNumElements(); SmallVector Ops( NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N), VT.getScalarType())); SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops); auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val); auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1)); return Or; } } if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorElementCount() != ElementCount::getFixed(1) || CCVT.getVectorElementType() != MVT::i1) return SDValue(); EVT ResVT = N->getValueType(0); EVT CmpVT = N0.getOperand(0).getValueType(); // Only combine when the result type is of the same size as the compared // operands. if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) return SDValue(); SDValue IfTrue = N->getOperand(1); SDValue IfFalse = N->getOperand(2); SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), N0.getOperand(0), N0.getOperand(1), cast(N0.getOperand(2))->get()); return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, IfTrue, IfFalse); } /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with /// the compare-mask instructions rather than going via NZCV, even if LHS and /// RHS are really scalar. This replaces any scalar setcc in the above pattern /// with a vector one followed by a DUP shuffle on the result. static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; SDValue N0 = N->getOperand(0); EVT ResVT = N->getValueType(0); if (N0.getOpcode() != ISD::SETCC) return SDValue(); if (ResVT.isScalableVT()) return SDValue(); // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered // scalar SetCCResultType. We also don't expect vectors, because we assume // that selects fed by vector SETCCs are canonicalized to VSELECT. assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) && "Scalar-SETCC feeding SELECT has unexpected result type!"); // If NumMaskElts == 0, the comparison is larger than select result. The // largest real NEON comparison is 64-bits per lane, which means the result is // at most 32-bits and an illegal vector. Just bail out for now. EVT SrcVT = N0.getOperand(0).getValueType(); // Don't try to do this optimization when the setcc itself has i1 operands. 
// There are no legal vectors of i1, so this would be pointless. if (SrcVT == MVT::i1) return SDValue(); int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); if (!ResVT.isVector() || NumMaskElts == 0) return SDValue(); SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); // Also bail out if the vector CCVT isn't the same size as ResVT. // This can happen if the SETCC operand size doesn't divide the ResVT size // (e.g., f64 vs v3f32). if (CCVT.getSizeInBits() != ResVT.getSizeInBits()) return SDValue(); // Make sure we didn't create illegal types, if we're not supposed to. assert(DCI.isBeforeLegalize() || DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)); // First perform a vector comparison, where lane 0 is the one we're interested // in. SDLoc DL(N0); SDValue LHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); SDValue RHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); // Now duplicate the comparison mask we want across all other lanes. SmallVector DUPMask(CCVT.getVectorNumElements(), 0); SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask); Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), Mask); return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); } static SDValue performDUPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the // 128bit vector version. if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) { EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext()); SmallVector Ops(N->ops()); if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(), DCI.DAG.getVTList(LVT), Ops)) { SDLoc DL(N); return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0), DCI.DAG.getConstant(0, DL, MVT::i64)); } } if (N->getOpcode() == AArch64ISD::DUP) return performPostLD1Combine(N, DCI, false); return SDValue(); } /// Get rid of unnecessary NVCASTs (that don't change the type). static SDValue performNVCASTCombine(SDNode *N) { if (N->getValueType(0) == N->getOperand(0).getValueType()) return N->getOperand(0); return SDValue(); } // If all users of the globaladdr are of the form (globaladdr + constant), find // the smallest constant, fold it into the globaladdr's offset and rewrite the // globaladdr as (globaladdr + constant) - constant. static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM) { auto *GN = cast(N); if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) != AArch64II::MO_NO_FLAG) return SDValue(); uint64_t MinOffset = -1ull; for (SDNode *N : GN->uses()) { if (N->getOpcode() != ISD::ADD) return SDValue(); auto *C = dyn_cast(N->getOperand(0)); if (!C) C = dyn_cast(N->getOperand(1)); if (!C) return SDValue(); MinOffset = std::min(MinOffset, C->getZExtValue()); } uint64_t Offset = MinOffset + GN->getOffset(); // Require that the new offset is larger than the existing one. Otherwise, we // can end up oscillating between two possible DAGs, for example, // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1). if (Offset <= uint64_t(GN->getOffset())) return SDValue(); // Check whether folding this offset is legal. 
  // It must not go out of bounds of
  // the referenced object to avoid violating the code model, and must be
  // smaller than 2^20 because this is the largest offset expressible in all
  // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
  // stores an immediate signed 21 bit offset.)
  //
  // This check also prevents us from folding negative offsets, which will end
  // up being treated in the same way as large positive ones. They could also
  // cause code model violations, and aren't really common enough to matter.
  if (Offset >= (1 << 20))
    return SDValue();

  const GlobalValue *GV = GN->getGlobal();
  Type *T = GV->getValueType();
  if (!T->isSized() ||
      Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
    return SDValue();

  SDLoc DL(GN);
  SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
  return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
                     DAG.getConstant(MinOffset, DL, MVT::i64));
}

static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
                                  const AArch64Subtarget *Subtarget) {
  SDValue BR = N->getOperand(0);
  if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
      !BR.getValueType().isScalarInteger())
    return SDValue();

  SDLoc DL(N);
  return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
}

// Turns the vector of indices into a vector of byte offsets by scaling Offset
// by (BitWidth / 8).
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
                                          SDLoc DL, unsigned BitWidth) {
  assert(Offset.getValueType().isScalableVector() &&
         "This method is only for scalable vectors of offsets");

  SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
  SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);

  return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
}

/// Check if the value of \p OffsetInBytes can be used as an immediate for
/// the gather load/prefetch and scatter store instructions with vector base
/// and immediate offset addressing mode:
///
///      [<Zn>.[S|D]{, #<imm>}]
///
/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
                                                  unsigned ScalarSizeInBytes) {
  // The immediate is not a multiple of the scalar size.
  if (OffsetInBytes % ScalarSizeInBytes)
    return false;

  // The immediate is out of range.
  if (OffsetInBytes / ScalarSizeInBytes > 31)
    return false;

  return true;
}

/// Check if the value of \p Offset represents a valid immediate for the SVE
/// gather load/prefetch and scatter store instructions with vector base and
/// immediate offset addressing mode:
///
///      [<Zn>.[S|D]{, #<imm>}]
///
/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
                                           unsigned ScalarSizeInBytes) {
  ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
  return OffsetConst && isValidImmForSVEVecImmAddrMode(
                            OffsetConst->getZExtValue(), ScalarSizeInBytes);
}

static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
                                          unsigned Opcode,
                                          bool OnlyPackedOffsets = true) {
  const SDValue Src = N->getOperand(2);
  const EVT SrcVT = Src->getValueType(0);
  assert(SrcVT.isScalableVector() &&
         "Scatter stores are only possible for SVE vectors");

  SDLoc DL(N);
  MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();

  // Make sure that source data will fit into an SVE register
  if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
    return SDValue();

  // For FPs, ACLE only supports _packed_ single and double precision types.
if (SrcElVT.isFloatingPoint()) if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64)) return SDValue(); // Depending on the addressing mode, this is either a pointer or a vector of // pointers (that fits into one register) SDValue Base = N->getOperand(4); // Depending on the addressing mode, this is either a single offset or a // vector of offsets (that fits into one register) SDValue Offset = N->getOperand(5); // For "scalar + vector of indices", just scale the indices. This only // applies to non-temporal scatters because there's no instruction that takes // indicies. if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) { Offset = getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits()); Opcode = AArch64ISD::SSTNT1_PRED; } // In the case of non-temporal gather loads there's only one SVE instruction // per data-size: "scalar + vector", i.e. // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] // Since we do have intrinsics that allow the arguments to be in a different // order, we may need to swap them to match the spec. if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector()) std::swap(Base, Offset); // SST1_IMM requires that the offset is an immediate that is: // * a multiple of #SizeInBytes, // * in the range [0, 31 x #SizeInBytes], // where #SizeInBytes is the size in bytes of the stored items. For // immediates outside that range and non-immediate scalar offsets use SST1 or // SST1_UXTW instead. if (Opcode == AArch64ISD::SST1_IMM_PRED) { if (!isValidImmForSVEVecImmAddrMode(Offset, SrcVT.getScalarSizeInBits() / 8)) { if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy) Opcode = AArch64ISD::SST1_UXTW_PRED; else Opcode = AArch64ISD::SST1_PRED; std::swap(Base, Offset); } } auto &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isTypeLegal(Base.getValueType())) return SDValue(); // Some scatter store variants allow unpacked offsets, but only as nxv2i32 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to // nxv2i64. Legalize accordingly. if (!OnlyPackedOffsets && Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32) Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0); if (!TLI.isTypeLegal(Offset.getValueType())) return SDValue(); // Source value type that is representable in hardware EVT HwSrcVt = getSVEContainerType(SrcVT); // Keep the original type of the input data to store - this is needed to be // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For // FP values we want the integer equivalent, so just use HwSrcVt. 
SDValue InputVT = DAG.getValueType(SrcVT); if (SrcVT.isFloatingPoint()) InputVT = DAG.getValueType(HwSrcVt); SDVTList VTs = DAG.getVTList(MVT::Other); SDValue SrcNew; if (Src.getValueType().isFloatingPoint()) SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src); else SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src); SDValue Ops[] = {N->getOperand(0), // Chain SrcNew, N->getOperand(3), // Pg Base, Offset, InputVT}; return DAG.getNode(Opcode, DL, VTs, Ops); } static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets = true) { const EVT RetVT = N->getValueType(0); assert(RetVT.isScalableVector() && "Gather loads are only possible for SVE vectors"); SDLoc DL(N); // Make sure that the loaded data will fit into an SVE register if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock) return SDValue(); // Depending on the addressing mode, this is either a pointer or a vector of // pointers (that fits into one register) SDValue Base = N->getOperand(3); // Depending on the addressing mode, this is either a single offset or a // vector of offsets (that fits into one register) SDValue Offset = N->getOperand(4); // For "scalar + vector of indices", just scale the indices. This only // applies to non-temporal gathers because there's no instruction that takes // indicies. if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) { Offset = getScaledOffsetForBitWidth(DAG, Offset, DL, RetVT.getScalarSizeInBits()); Opcode = AArch64ISD::GLDNT1_MERGE_ZERO; } // In the case of non-temporal gather loads there's only one SVE instruction // per data-size: "scalar + vector", i.e. // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] // Since we do have intrinsics that allow the arguments to be in a different // order, we may need to swap them to match the spec. if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO && Offset.getValueType().isVector()) std::swap(Base, Offset); // GLD{FF}1_IMM requires that the offset is an immediate that is: // * a multiple of #SizeInBytes, // * in the range [0, 31 x #SizeInBytes], // where #SizeInBytes is the size in bytes of the loaded items. For // immediates outside that range and non-immediate scalar offsets use // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead. if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO || Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) { if (!isValidImmForSVEVecImmAddrMode(Offset, RetVT.getScalarSizeInBits() / 8)) { if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy) Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO) ? AArch64ISD::GLD1_UXTW_MERGE_ZERO : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO; else Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO) ? AArch64ISD::GLD1_MERGE_ZERO : AArch64ISD::GLDFF1_MERGE_ZERO; std::swap(Base, Offset); } } auto &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isTypeLegal(Base.getValueType())) return SDValue(); // Some gather load variants allow unpacked offsets, but only as nxv2i32 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to // nxv2i64. Legalize accordingly. if (!OnlyPackedOffsets && Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32) Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0); // Return value type that is representable in hardware EVT HwRetVt = getSVEContainerType(RetVT); // Keep the original output value type around - this is needed to be able to // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP // values we want the integer equivalent, so just use HwRetVT. 
  SDValue OutVT = DAG.getValueType(RetVT);
  if (RetVT.isFloatingPoint())
    OutVT = DAG.getValueType(HwRetVt);

  SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
  SDValue Ops[] = {N->getOperand(0), // Chain
                   N->getOperand(2), // Pg
                   Base, Offset, OutVT};

  SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
  SDValue LoadChain = SDValue(Load.getNode(), 1);

  if (RetVT.isInteger() && (RetVT != HwRetVt))
    Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));

  // If the original return value was FP, bitcast accordingly. Doing it here
  // means that we can avoid adding TableGen patterns for FPs.
  if (RetVT.isFloatingPoint())
    Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));

  return DAG.getMergeValues({Load, LoadChain}, DL);
}

static SDValue
performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                              SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Src = N->getOperand(0);
  unsigned Opc = Src->getOpcode();

  // Sign extend of an unsigned unpack -> signed unpack
  if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {

    unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
                                               : AArch64ISD::SUNPKLO;

    // Push the sign extend to the operand of the unpack
    // This is necessary where, for example, the operand of the unpack
    // is another unpack:
    // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
    // ->
    // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
    // ->
    // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
    SDValue ExtOp = Src->getOperand(0);
    auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
    EVT EltTy = VT.getVectorElementType();
    (void)EltTy;

    assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
           "Sign extending from an invalid type");

    EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());

    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
                              ExtOp, DAG.getValueType(ExtVT));

    return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
  }

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (!EnableCombineMGatherIntrinsics)
    return SDValue();

  // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
  // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
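  // Illustrative shape of the rewrite performed below (operand layout and
  // types assumed for exposition):
  //   (sign_extend_inreg (GLD1_MERGE_ZERO pg, base, offset, nxv4i16), nxv4i16)
  //     -> (GLD1S_MERGE_ZERO pg, base, offset, nxv4i16)
  // i.e. the zero-extending load is replaced by its sign-extending
  // counterpart, making the SIGN_EXTEND_INREG redundant.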
unsigned NewOpc; unsigned MemVTOpNum = 4; switch (Opc) { case AArch64ISD::LD1_MERGE_ZERO: NewOpc = AArch64ISD::LD1S_MERGE_ZERO; MemVTOpNum = 3; break; case AArch64ISD::LDNF1_MERGE_ZERO: NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO; MemVTOpNum = 3; break; case AArch64ISD::LDFF1_MERGE_ZERO: NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO; MemVTOpNum = 3; break; case AArch64ISD::GLD1_MERGE_ZERO: NewOpc = AArch64ISD::GLD1S_MERGE_ZERO; break; case AArch64ISD::GLD1_SCALED_MERGE_ZERO: NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO; break; case AArch64ISD::GLD1_SXTW_MERGE_ZERO: NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO; break; case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; break; case AArch64ISD::GLD1_UXTW_MERGE_ZERO: NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO; break; case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; break; case AArch64ISD::GLD1_IMM_MERGE_ZERO: NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO; break; case AArch64ISD::GLDFF1_MERGE_ZERO: NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO; break; case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO; break; case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO; break; case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO; break; case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO; break; case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO; break; case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO; break; case AArch64ISD::GLDNT1_MERGE_ZERO: NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO; break; default: return SDValue(); } EVT SignExtSrcVT = cast(N->getOperand(1))->getVT(); EVT SrcMemVT = cast(Src->getOperand(MemVTOpNum))->getVT(); if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse()) return SDValue(); EVT DstVT = N->getValueType(0); SDVTList VTs = DAG.getVTList(DstVT, MVT::Other); SmallVector Ops; for (unsigned I = 0; I < Src->getNumOperands(); ++I) Ops.push_back(Src->getOperand(I)); SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops); DCI.CombineTo(N, ExtLoad); DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1)); // Return N so it doesn't get rechecked return SDValue(N, 0); } /// Legalize the gather prefetch (scalar + vector addressing mode) when the /// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset /// != nxv2i32) do not need legalization. static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) { const unsigned OffsetPos = 4; SDValue Offset = N->getOperand(OffsetPos); // Not an unpacked vector, bail out. if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32) return SDValue(); // Extend the unpacked offset vector to 64-bit lanes. SDLoc DL(N); Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset); SmallVector Ops(N->op_begin(), N->op_end()); // Replace the offset operand with the 64-bit one. Ops[OffsetPos] = Offset; return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); } /// Combines a node carrying the intrinsic /// `aarch64_sve_prf_gather_scalar_offset` into a node that uses /// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to /// `aarch64_sve_prf_gather_scalar_offset` is not a valid immediate for the /// sve gather prefetch instruction with vector plus immediate addressing mode. 
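/// For example (illustrative, operand layout assumed): when the scalar offset
/// of `aarch64_sve_prfb_gather_scalar_offset` is not a multiple of
/// #SizeInBytes or lies outside [0, 31 x #SizeInBytes], the scalar is used as
/// the base register and the vector of bases is reinterpreted as the index
/// vector of `aarch64_sve_prfb_gather_uxtw_index`.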
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes) { const unsigned ImmPos = 4, OffsetPos = 3; // No need to combine the node if the immediate is valid... if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes)) return SDValue(); // ...otherwise swap the offset base with the offset... SmallVector Ops(N->op_begin(), N->op_end()); std::swap(Ops[ImmPos], Ops[OffsetPos]); // ...and remap the intrinsic `aarch64_sve_prf_gather_scalar_offset` to // `aarch64_sve_prfb_gather_uxtw_index`. SDLoc DL(N); Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL, MVT::i64); return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); } // Return true if the vector operation can guarantee only the first lane of its // result contains data, with all bits in other lanes set to zero. static bool isLanes1toNKnownZero(SDValue Op) { switch (Op.getOpcode()) { default: return false; case AArch64ISD::ANDV_PRED: case AArch64ISD::EORV_PRED: case AArch64ISD::FADDA_PRED: case AArch64ISD::FADDV_PRED: case AArch64ISD::FMAXNMV_PRED: case AArch64ISD::FMAXV_PRED: case AArch64ISD::FMINNMV_PRED: case AArch64ISD::FMINV_PRED: case AArch64ISD::ORV_PRED: case AArch64ISD::SADDV_PRED: case AArch64ISD::SMAXV_PRED: case AArch64ISD::SMINV_PRED: case AArch64ISD::UADDV_PRED: case AArch64ISD::UMAXV_PRED: case AArch64ISD::UMINV_PRED: return true; } } static SDValue removeRedundantInsertVectorElt(SDNode *N) { assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!"); SDValue InsertVec = N->getOperand(0); SDValue InsertElt = N->getOperand(1); SDValue InsertIdx = N->getOperand(2); // We only care about inserts into the first element... if (!isNullConstant(InsertIdx)) return SDValue(); // ...of a zero'd vector... if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode())) return SDValue(); // ...where the inserted data was previously extracted... if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); SDValue ExtractVec = InsertElt.getOperand(0); SDValue ExtractIdx = InsertElt.getOperand(1); // ...from the first element of a vector. if (!isNullConstant(ExtractIdx)) return SDValue(); // If we get here we are effectively trying to zero lanes 1-N of a vector. // Ensure there's no type conversion going on. if (N->getValueType(0) != ExtractVec.getValueType()) return SDValue(); if (!isLanes1toNKnownZero(ExtractVec)) return SDValue(); // The explicit zeroing is redundant. 
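  // Illustrative pattern (types assumed for exposition):
  //   t1: nxv2i64 = AArch64ISD::UADDV_PRED p0, z0   // lanes 1-N known zero
  //   t2: i64     = extract_vector_elt t1, 0
  //   t3: nxv2i64 = insert_vector_elt (zero splat), t2, 0
  // t3 is replaced directly with t1.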
return ExtractVec; } static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { if (SDValue Res = removeRedundantInsertVectorElt(N)) return Res; return performPostLD1Combine(N, DCI, true); } static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) { EVT Ty = N->getValueType(0); if (Ty.isInteger()) return SDValue(); EVT IntTy = Ty.changeVectorElementTypeToInteger(); EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount()); if (ExtIntTy.getVectorElementType().getScalarSizeInBits() < IntTy.getVectorElementType().getScalarSizeInBits()) return SDValue(); SDLoc DL(N); SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)), DL, ExtIntTy); SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)), DL, ExtIntTy); SDValue Idx = N->getOperand(2); SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx); SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy); return DAG.getBitcast(Ty, Trunc); } static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded. if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND) return SDValue(); auto hasValidElementTypeForFPExtLoad = [](EVT VT) { EVT EltVT = VT.getVectorElementType(); return EltVT == MVT::f32 || EltVT == MVT::f64; }; // fold (fpext (load x)) -> (fpext (fptrunc (extload x))) // We purposefully don't care about legality of the nodes here as we know // they can be split down into something legal. if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() && VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) && VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); DCI.CombineTo(N, ExtLoad); DCI.CombineTo( N0.getNode(), DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad, DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)), ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! 
} return SDValue(); } static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { EVT VT = N->getValueType(0); // Don't expand for NEON, SVE2 or SME if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME()) return SDValue(); SDLoc DL(N); SDValue Mask = N->getOperand(0); SDValue In1 = N->getOperand(1); SDValue In2 = N->getOperand(2); SDValue InvMask = DAG.getNOT(DL, Mask, VT); SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1); SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2); return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv); } static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDValue Insert = N->getOperand(0); if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR) return SDValue(); if (!Insert.getOperand(0).isUndef()) return SDValue(); uint64_t IdxInsert = Insert.getConstantOperandVal(2); uint64_t IdxDupLane = N->getConstantOperandVal(1); if (IdxInsert != 0 || IdxDupLane != 0) return SDValue(); SDValue Bitcast = Insert.getOperand(1); if (Bitcast.getOpcode() != ISD::BITCAST) return SDValue(); SDValue Subvec = Bitcast.getOperand(0); EVT SubvecVT = Subvec.getValueType(); if (!SubvecVT.is128BitVector()) return SDValue(); EVT NewSubvecVT = getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType()); SDLoc DL(N); SDValue NewInsert = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT, DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2)); SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT, NewInsert, N->getOperand(1)); return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128); } // Try to combine mull with uzp1. static SDValue tryCombineMULLWithUZP1(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDValue ExtractHigh; SDValue ExtractLow; SDValue TruncHigh; SDValue TruncLow; SDLoc DL(N); // Check the operands are trunc and extract_high. if (isEssentiallyExtractHighSubvector(LHS) && RHS.getOpcode() == ISD::TRUNCATE) { TruncHigh = RHS; if (LHS.getOpcode() == ISD::BITCAST) ExtractHigh = LHS.getOperand(0); else ExtractHigh = LHS; } else if (isEssentiallyExtractHighSubvector(RHS) && LHS.getOpcode() == ISD::TRUNCATE) { TruncHigh = LHS; if (LHS.getOpcode() == ISD::BITCAST) ExtractHigh = RHS.getOperand(0); else ExtractHigh = RHS; } else return SDValue(); // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op // with uzp1. // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll SDValue TruncHighOp = TruncHigh.getOperand(0); EVT TruncHighOpVT = TruncHighOp.getValueType(); if (TruncHighOp.getOpcode() == AArch64ISD::DUP || DAG.isSplatValue(TruncHighOp, false)) return SDValue(); // Check there is other extract_high with same source vector. // For example, // // t18: v4i16 = extract_subvector t2, Constant:i64<0> // t12: v4i16 = truncate t11 // t31: v4i32 = AArch64ISD::SMULL t18, t12 // t23: v4i16 = extract_subvector t2, Constant:i64<4> // t16: v4i16 = truncate t15 // t30: v4i32 = AArch64ISD::SMULL t23, t1 // // This dagcombine assumes the two extract_high uses same source vector in // order to detect the pair of the mull. If they have different source vector, // this code will not work. bool HasFoundMULLow = true; SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0); if (ExtractHighSrcVec->use_size() != 2) HasFoundMULLow = false; // Find ExtractLow. 
for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) { if (User == ExtractHigh.getNode()) continue; if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR || !isNullConstant(User->getOperand(1))) { HasFoundMULLow = false; break; } ExtractLow.setNode(User); } if (!ExtractLow || !ExtractLow->hasOneUse()) HasFoundMULLow = false; // Check ExtractLow's user. if (HasFoundMULLow) { SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin(); if (ExtractLowUser->getOpcode() != N->getOpcode()) HasFoundMULLow = false; if (ExtractLowUser->getOperand(0) == ExtractLow) { if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE) TruncLow = ExtractLowUser->getOperand(1); else HasFoundMULLow = false; } else { if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE) TruncLow = ExtractLowUser->getOperand(0); else HasFoundMULLow = false; } } // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op // with uzp1. // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll EVT TruncHighVT = TruncHigh.getValueType(); EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext()); SDValue TruncLowOp = HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT); EVT TruncLowOpVT = TruncLowOp.getValueType(); if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP || DAG.isSplatValue(TruncLowOp, false))) return SDValue(); // Create uzp1, extract_high and extract_low. if (TruncHighOpVT != UZP1VT) TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp); if (TruncLowOpVT != UZP1VT) TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp); SDValue UZP1 = DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp); SDValue HighIdxCst = DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64); SDValue NewTruncHigh = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst); DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh); if (HasFoundMULLow) { EVT TruncLowVT = TruncLow.getValueType(); SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT, UZP1, ExtractLow.getOperand(1)); DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow); } return SDValue(N, 0); } static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (SDValue Val = tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG)) return Val; if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG)) return Val; return SDValue(); } SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: LLVM_DEBUG(dbgs() << "Custom combining: skipping\n"); break; case ISD::VECREDUCE_AND: case ISD::VECREDUCE_OR: case ISD::VECREDUCE_XOR: return performVecReduceBitwiseCombine(N, DCI, DAG); case ISD::ADD: case ISD::SUB: return performAddSubCombine(N, DCI); case ISD::BUILD_VECTOR: return performBuildVectorCombine(N, DCI, DAG); case ISD::TRUNCATE: return performTruncateCombine(N, DAG); case AArch64ISD::ANDS: return performFlagSettingCombine(N, DCI, ISD::AND); case AArch64ISD::ADC: if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true)) return R; return foldADCToCINC(N, DAG); case AArch64ISD::SBC: return foldOverflowCheck(N, DAG, /* IsAdd */ false); case AArch64ISD::ADCS: if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true)) return R; return performFlagSettingCombine(N, DCI, AArch64ISD::ADC); case AArch64ISD::SBCS: if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false)) return R; return performFlagSettingCombine(N, DCI, 
AArch64ISD::SBC); case ISD::XOR: return performXorCombine(N, DAG, DCI, Subtarget); case ISD::MUL: return performMulCombine(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return performIntToFpCombine(N, DAG, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: return performFpToIntCombine(N, DAG, DCI, Subtarget); case ISD::FDIV: return performFDivCombine(N, DAG, DCI, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget, *this); case ISD::AND: return performANDCombine(N, DCI); case ISD::FADD: return performFADDCombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: return performIntrinsicCombine(N, DCI, Subtarget); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: return performExtendCombine(N, DCI, DAG); case ISD::SIGN_EXTEND_INREG: return performSignExtendInRegCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); case ISD::EXTRACT_SUBVECTOR: return performExtractSubvectorCombine(N, DCI, DAG); case ISD::INSERT_SUBVECTOR: return performInsertSubvectorCombine(N, DCI, DAG); case ISD::SELECT: return performSelectCombine(N, DCI); case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); case ISD::SETCC: return performSETCCCombine(N, DCI, DAG); case ISD::LOAD: return performLOADCombine(N, DCI, DAG, Subtarget); case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); case ISD::MSTORE: return performMSTORECombine(N, DCI, DAG, Subtarget); case ISD::MGATHER: case ISD::MSCATTER: return performMaskedGatherScatterCombine(N, DCI, DAG); case ISD::VECTOR_SPLICE: return performSVESpliceCombine(N, DAG); case ISD::FP_EXTEND: return performFPExtendCombine(N, DAG, DCI, Subtarget); case AArch64ISD::BRCOND: return performBRCONDCombine(N, DCI, DAG); case AArch64ISD::TBNZ: case AArch64ISD::TBZ: return performTBZCombine(N, DCI, DAG); case AArch64ISD::CSEL: return performCSELCombine(N, DCI, DAG); case AArch64ISD::DUP: case AArch64ISD::DUPLANE8: case AArch64ISD::DUPLANE16: case AArch64ISD::DUPLANE32: case AArch64ISD::DUPLANE64: return performDUPCombine(N, DCI); case AArch64ISD::DUPLANE128: return performDupLane128Combine(N, DAG); case AArch64ISD::NVCAST: return performNVCASTCombine(N); case AArch64ISD::SPLICE: return performSpliceCombine(N, DAG); case AArch64ISD::UUNPKLO: case AArch64ISD::UUNPKHI: return performUnpackCombine(N, DAG, Subtarget); case AArch64ISD::UZP1: return performUzpCombine(N, DAG); case AArch64ISD::SETCC_MERGE_ZERO: return performSetccMergeZeroCombine(N, DCI); case AArch64ISD::REINTERPRET_CAST: return performReinterpretCastCombine(N); case AArch64ISD::GLD1_MERGE_ZERO: case AArch64ISD::GLD1_SCALED_MERGE_ZERO: case AArch64ISD::GLD1_UXTW_MERGE_ZERO: case AArch64ISD::GLD1_SXTW_MERGE_ZERO: case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: case AArch64ISD::GLD1_IMM_MERGE_ZERO: case AArch64ISD::GLD1S_MERGE_ZERO: case AArch64ISD::GLD1S_SCALED_MERGE_ZERO: case AArch64ISD::GLD1S_UXTW_MERGE_ZERO: case AArch64ISD::GLD1S_SXTW_MERGE_ZERO: case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO: case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO: case AArch64ISD::GLD1S_IMM_MERGE_ZERO: return performGLD1Combine(N, DAG); case AArch64ISD::VASHR: case AArch64ISD::VLSHR: return performVectorShiftCombine(N, *this, DCI); case AArch64ISD::SUNPKLO: return performSunpkloCombine(N, DAG); case AArch64ISD::BSP: return performBSPExpandForSVE(N, DAG, Subtarget); case ISD::INSERT_VECTOR_ELT: return performInsertVectorEltCombine(N, DCI); case 
ISD::EXTRACT_VECTOR_ELT: return performExtractVectorEltCombine(N, DCI, Subtarget); case ISD::VECREDUCE_ADD: return performVecReduceAddCombine(N, DCI.DAG, Subtarget); case AArch64ISD::UADDV: return performUADDVCombine(N, DAG); case AArch64ISD::SMULL: case AArch64ISD::UMULL: case AArch64ISD::PMULL: return performMULLCombine(N, DCI, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { case Intrinsic::aarch64_sve_prfb_gather_scalar_offset: return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/); case Intrinsic::aarch64_sve_prfh_gather_scalar_offset: return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/); case Intrinsic::aarch64_sve_prfw_gather_scalar_offset: return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/); case Intrinsic::aarch64_sve_prfd_gather_scalar_offset: return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/); case Intrinsic::aarch64_sve_prfb_gather_uxtw_index: case Intrinsic::aarch64_sve_prfb_gather_sxtw_index: case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: return legalizeSVEGatherPrefetchOffsVec(N, DAG); case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: case Intrinsic::aarch64_neon_ld1x2: case Intrinsic::aarch64_neon_ld1x3: case Intrinsic::aarch64_neon_ld1x4: case Intrinsic::aarch64_neon_ld2lane: case Intrinsic::aarch64_neon_ld3lane: case Intrinsic::aarch64_neon_ld4lane: case Intrinsic::aarch64_neon_ld2r: case Intrinsic::aarch64_neon_ld3r: case Intrinsic::aarch64_neon_ld4r: case Intrinsic::aarch64_neon_st2: case Intrinsic::aarch64_neon_st3: case Intrinsic::aarch64_neon_st4: case Intrinsic::aarch64_neon_st1x2: case Intrinsic::aarch64_neon_st1x3: case Intrinsic::aarch64_neon_st1x4: case Intrinsic::aarch64_neon_st2lane: case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: return performNEONPostLDSTCombine(N, DCI, DAG); case Intrinsic::aarch64_sve_ldnt1: return performLDNT1Combine(N, DAG); case Intrinsic::aarch64_sve_ld1rq: return performLD1ReplicateCombine(N, DAG); case Intrinsic::aarch64_sve_ld1ro: return performLD1ReplicateCombine(N, DAG); case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); case Intrinsic::aarch64_sve_ldnt1_gather: return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); case Intrinsic::aarch64_sve_ldnt1_gather_index: return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_INDEX_MERGE_ZERO); case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); case Intrinsic::aarch64_sve_ld1: return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO); case Intrinsic::aarch64_sve_ldnf1: return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO); case Intrinsic::aarch64_sve_ldff1: return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO); case Intrinsic::aarch64_sve_st1: return performST1Combine(N, DAG); case Intrinsic::aarch64_sve_stnt1: return performSTNT1Combine(N, DAG); case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: return 
performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); case Intrinsic::aarch64_sve_stnt1_scatter: return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); case Intrinsic::aarch64_sve_stnt1_scatter_index: return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED); case Intrinsic::aarch64_sve_ld1_gather: return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO); case Intrinsic::aarch64_sve_ld1_gather_index: return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SCALED_MERGE_ZERO); case Intrinsic::aarch64_sve_ld1_gather_sxtw: return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_uxtw: return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO); case Intrinsic::aarch64_sve_ldff1_gather: return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO); case Intrinsic::aarch64_sve_ldff1_gather_index: return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_SCALED_MERGE_ZERO); case Intrinsic::aarch64_sve_ldff1_gather_sxtw: return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_SXTW_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ldff1_gather_uxtw: return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_UXTW_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_IMM_MERGE_ZERO); case Intrinsic::aarch64_sve_st1_scatter: return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED); case Intrinsic::aarch64_sve_st1_scatter_index: return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED); case Intrinsic::aarch64_sve_st1_scatter_sxtw: return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_uxtw: return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED_PRED, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED_PRED, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED); case Intrinsic::aarch64_rndr: case Intrinsic::aarch64_rndrrs: { unsigned IntrinsicID = cast(N->getOperand(1))->getZExtValue(); auto Register = (IntrinsicID == Intrinsic::aarch64_rndr ? 
AArch64SysReg::RNDR : AArch64SysReg::RNDRRS); SDLoc DL(N); SDValue A = DAG.getNode( AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other), N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64)); SDValue B = DAG.getNode( AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1)); return DAG.getMergeValues( {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); } default: break; } break; case ISD::GlobalAddress: return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine()); case ISD::CTLZ: return performCTLZCombine(N, DAG, Subtarget); } return SDValue(); } // Check if the return value is used as only a return value, as otherwise // we can't perform a tail-call. In particular, we need to check for // target ISD nodes that are returns and any other "odd" constructs // that the generic analysis code won't necessarily catch. bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1) return false; if (!N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; bool HasRet = false; for (SDNode *Node : Copy->uses()) { if (Node->getOpcode() != AArch64ISD::RET_GLUE) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } // Return whether the an instruction can potentially be optimized to a tail // call. This will cause the optimizers to attempt to move, or duplicate, // return instructions to help enable tail call optimizations for this // instruction. bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { return CI->isTailCall(); } bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, SelectionDAG &DAG) const { if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) return false; // Non-null if there is exactly one user of the loaded value (ignoring chain). SDNode *ValOnlyUser = nullptr; for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; ++UI) { if (UI.getUse().getResNo() == 1) continue; // Ignore chain. if (ValOnlyUser == nullptr) ValOnlyUser = *UI; else { ValOnlyUser = nullptr; // Multiple non-chain uses, bail out. break; } } auto IsUndefOrZero = [](SDValue V) { return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true); }; // If the only user of the value is a scalable vector splat, it is // preferable to do a replicating load (ld1r*). if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() && (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR || (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU && IsUndefOrZero(ValOnlyUser->getOperand(2))))) return false; Base = Op->getOperand(0); // All of the indexed addressing mode instructions take a signed // 9 bit immediate offset. if (ConstantSDNode *RHS = dyn_cast(Op->getOperand(1))) { int64_t RHSC = RHS->getSExtValue(); if (Op->getOpcode() == ISD::SUB) RHSC = -(uint64_t)RHSC; if (!isInt<9>(RHSC)) return false; // Always emit pre-inc/post-inc addressing mode. Use negated constant offset // when dealing with subtraction. 
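    // For example (illustrative): a pointer update of the form
    // `sub xN, xN, #16` is represented here as an offset of -16, which still
    // fits the signed 9-bit immediate accepted by the pre/post-indexed forms.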
Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0)); return true; } return false; } bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; if (LoadSDNode *LD = dyn_cast(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); } else if (StoreSDNode *ST = dyn_cast(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); } else return false; if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG)) return false; AM = ISD::PRE_INC; return true; } bool AArch64TargetLowering::getPostIndexedAddressParts( SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; if (LoadSDNode *LD = dyn_cast(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); } else if (StoreSDNode *ST = dyn_cast(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); } else return false; if (!getIndexedAddressParts(N, Op, Base, Offset, DAG)) return false; // Post-indexing updates the base, so it's not a valid transform // if that's not the same as the load's pointer. if (Ptr != Base) return false; AM = ISD::POST_INC; return true; } static void replaceBoolVectorBitcast(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) { SDLoc DL(N); SDValue Op = N->getOperand(0); EVT VT = N->getValueType(0); [[maybe_unused]] EVT SrcVT = Op.getValueType(); assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 && "Must be bool vector."); // Special handling for Clang's __builtin_convertvector. For vectors with <8 // elements, it adds a vector concatenation with undef(s). If we encounter // this here, we can skip the concat. if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) { bool AllUndef = true; for (unsigned I = 1; I < Op.getNumOperands(); ++I) AllUndef &= Op.getOperand(I).isUndef(); if (AllUndef) Op = Op.getOperand(0); } SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG); if (VectorBits) Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT)); } static void CustomNonLegalBITCASTResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG, EVT ExtendVT, EVT CastVT) { SDLoc DL(N); SDValue Op = N->getOperand(0); EVT VT = N->getValueType(0); // Use SCALAR_TO_VECTOR for lane zero SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op); SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec); SDValue IdxZero = DAG.getVectorIdxConstant(0, DL); Results.push_back( DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero)); return; } void AArch64TargetLowering::ReplaceBITCASTResults( SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { SDLoc DL(N); SDValue Op = N->getOperand(0); EVT VT = N->getValueType(0); EVT SrcVT = Op.getValueType(); if (VT == MVT::v2i16 && SrcVT == MVT::i32) { CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16); return; } if (VT == MVT::v4i8 && SrcVT == MVT::i32) { CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8); return; } if (VT == MVT::v2i8 && SrcVT == MVT::i16) { CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8); return; } if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) { assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() && "Expected fp->int bitcast!"); // Bitcasting between unpacked vector types of different element counts is // not a NOP because the live elements are laid out differently. // 01234567 // e.g. nxv2i32 = XX??XX?? // nxv4f16 = X?X?X?X? 
if (VT.getVectorElementCount() != SrcVT.getVectorElementCount()) return; SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult)); return; } if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1) return replaceBoolVectorBitcast(N, Results, DAG); if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16)) return; Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32, DAG.getUNDEF(MVT::i32), Op); Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); } static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { EVT VT = N->getValueType(0); if (!VT.is256BitVector() || (VT.getScalarType().isFloatingPoint() && !N->getFlags().hasAllowReassociation()) || (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16())) return; SDValue X = N->getOperand(0); auto *Shuf = dyn_cast(N->getOperand(1)); if (!Shuf) { Shuf = dyn_cast(N->getOperand(0)); X = N->getOperand(1); if (!Shuf) return; } if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef()) return; // Check the mask is 1,0,3,2,5,4,... ArrayRef Mask = Shuf->getMask(); for (int I = 0, E = Mask.size(); I < E; I++) if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1)) return; SDLoc DL(N); auto LoHi = DAG.SplitVector(X, DL); assert(LoHi.first.getValueType() == LoHi.second.getValueType()); SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(), LoHi.first, LoHi.second); // Shuffle the elements back into order. SmallVector NMask; for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) { NMask.push_back(I); NMask.push_back(I); } Results.push_back( DAG.getVectorShuffle(VT, DL, DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp, DAG.getUNDEF(LoHi.first.getValueType())), DAG.getUNDEF(VT), NMask)); } static void ReplaceReductionResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp) { EVT LoVT, HiVT; SDValue Lo, Hi; SDLoc dl(N); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi); SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal); Results.push_back(SplitVal); } void AArch64TargetLowering::ReplaceExtractSubVectorResults( SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); // Common code will handle these just fine. if (!InVT.isScalableVector() || !InVT.isInteger()) return; SDLoc DL(N); EVT VT = N->getValueType(0); // The following checks bail if this is not a halving operation. ElementCount ResEC = VT.getVectorElementCount(); if (InVT.getVectorElementCount() != (ResEC * 2)) return; auto *CIndex = dyn_cast(N->getOperand(1)); if (!CIndex) return; unsigned Index = CIndex->getZExtValue(); if ((Index != 0) && (Index != ResEC.getKnownMinValue())) return; unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI; EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext()); SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0)); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half)); } // Create an even/odd pair of X registers holding integer value V. 
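// Descriptive sketch of what follows: the i128 value V is split into
// V[63:0] and V[127:64] (swapped on big-endian targets) and tied together
// with a REG_SEQUENCE into the XSeqPairs register class, so that CASP-style
// instructions can consume the pair as a single operand.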
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { SDLoc dl(V.getNode()); SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64); SDValue VHi = DAG.getAnyExtOrTrunc( DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)), dl, MVT::i64); if (DAG.getDataLayout().isBigEndian()) std::swap (VLo, VHi); SDValue RegClass = DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32); SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32); SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32); const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; return SDValue( DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); } static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { assert(N->getValueType(0) == MVT::i128 && "AtomicCmpSwap on types less than 128 should be legal"); MachineMemOperand *MemOp = cast(N)->getMemOperand(); if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) { // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type, // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG. SDValue Ops[] = { createGPRPairNode(DAG, N->getOperand(2)), // Compare value createGPRPairNode(DAG, N->getOperand(3)), // Store value N->getOperand(1), // Ptr N->getOperand(0), // Chain in }; unsigned Opcode; switch (MemOp->getMergedOrdering()) { case AtomicOrdering::Monotonic: Opcode = AArch64::CASPX; break; case AtomicOrdering::Acquire: Opcode = AArch64::CASPAX; break; case AtomicOrdering::Release: Opcode = AArch64::CASPLX; break; case AtomicOrdering::AcquireRelease: case AtomicOrdering::SequentiallyConsistent: Opcode = AArch64::CASPALX; break; default: llvm_unreachable("Unexpected ordering!"); } MachineSDNode *CmpSwap = DAG.getMachineNode( Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops); DAG.setNodeMemRefs(CmpSwap, {MemOp}); unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64; if (DAG.getDataLayout().isBigEndian()) std::swap(SubReg1, SubReg2); SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64, SDValue(CmpSwap, 0)); SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64, SDValue(CmpSwap, 0)); Results.push_back( DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi)); Results.push_back(SDValue(CmpSwap, 1)); // Chain out return; } unsigned Opcode; switch (MemOp->getMergedOrdering()) { case AtomicOrdering::Monotonic: Opcode = AArch64::CMP_SWAP_128_MONOTONIC; break; case AtomicOrdering::Acquire: Opcode = AArch64::CMP_SWAP_128_ACQUIRE; break; case AtomicOrdering::Release: Opcode = AArch64::CMP_SWAP_128_RELEASE; break; case AtomicOrdering::AcquireRelease: case AtomicOrdering::SequentiallyConsistent: Opcode = AArch64::CMP_SWAP_128; break; default: llvm_unreachable("Unexpected ordering!"); } SDLoc DL(N); auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64); auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64); SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second, New.first, New.second, N->getOperand(0)}; SDNode *CmpSwap = DAG.getMachineNode( Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops); DAG.setNodeMemRefs(cast(CmpSwap), {MemOp}); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, SDValue(CmpSwap, 0), SDValue(CmpSwap, 1))); Results.push_back(SDValue(CmpSwap, 3)); } static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode, AtomicOrdering Ordering) { // 
ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because // the type is not legal. Therefore we shouldn't expect to see a 128-bit // ATOMIC_LOAD_CLR at any point. assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR && "ATOMIC_LOAD_AND should be lowered to LDCLRP directly"); assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD"); assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB"); if (ISDOpcode == ISD::ATOMIC_LOAD_AND) { // The operand will need to be XORed in a separate step. switch (Ordering) { case AtomicOrdering::Monotonic: return AArch64::LDCLRP; break; case AtomicOrdering::Acquire: return AArch64::LDCLRPA; break; case AtomicOrdering::Release: return AArch64::LDCLRPL; break; case AtomicOrdering::AcquireRelease: case AtomicOrdering::SequentiallyConsistent: return AArch64::LDCLRPAL; break; default: llvm_unreachable("Unexpected ordering!"); } } if (ISDOpcode == ISD::ATOMIC_LOAD_OR) { switch (Ordering) { case AtomicOrdering::Monotonic: return AArch64::LDSETP; break; case AtomicOrdering::Acquire: return AArch64::LDSETPA; break; case AtomicOrdering::Release: return AArch64::LDSETPL; break; case AtomicOrdering::AcquireRelease: case AtomicOrdering::SequentiallyConsistent: return AArch64::LDSETPAL; break; default: llvm_unreachable("Unexpected ordering!"); } } if (ISDOpcode == ISD::ATOMIC_SWAP) { switch (Ordering) { case AtomicOrdering::Monotonic: return AArch64::SWPP; break; case AtomicOrdering::Acquire: return AArch64::SWPPA; break; case AtomicOrdering::Release: return AArch64::SWPPL; break; case AtomicOrdering::AcquireRelease: case AtomicOrdering::SequentiallyConsistent: return AArch64::SWPPAL; break; default: llvm_unreachable("Unexpected ordering!"); } } llvm_unreachable("Unexpected ISDOpcode!"); } static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { // LSE128 has a 128-bit RMW ops, but i128 is not a legal type, so lower it // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions // rather than the CASP instructions, because CASP has register classes for // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG // to present them as single operands. LSE128 instructions use the GPR64 // register class (because the pair does not have to be sequential), like // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR. 
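  // Note on ATOMIC_LOAD_AND (describing the code below): LSE128 provides
  // LDCLRP, a bitwise clear rather than an AND, so both 64-bit halves of the
  // operand are first inverted with XOR, i.e. roughly
  //   atomicrmw and ptr, val  ==>  LDCLRP (~lo(val)), (~hi(val)), ptr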
assert(N->getValueType(0) == MVT::i128 && "AtomicLoadXXX on types less than 128 should be legal"); if (!Subtarget->hasLSE128()) return; MachineMemOperand *MemOp = cast(N)->getMemOperand(); const SDValue &Chain = N->getOperand(0); const SDValue &Ptr = N->getOperand(1); const SDValue &Val128 = N->getOperand(2); std::pair Val2x64 = DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64); const unsigned ISDOpcode = N->getOpcode(); const unsigned MachineOpcode = getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering()); if (ISDOpcode == ISD::ATOMIC_LOAD_AND) { SDLoc dl(Val128); Val2x64.first = DAG.getNode(ISD::XOR, dl, MVT::i64, DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first); Val2x64.second = DAG.getNode(ISD::XOR, dl, MVT::i64, DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second); } SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain}; if (DAG.getDataLayout().isBigEndian()) std::swap(Ops[0], Ops[1]); MachineSDNode *AtomicInst = DAG.getMachineNode(MachineOpcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops); DAG.setNodeMemRefs(AtomicInst, {MemOp}); SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1); if (DAG.getDataLayout().isBigEndian()) std::swap(Lo, Hi); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi)); Results.push_back(SDValue(AtomicInst, 2)); // Chain out } void AArch64TargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { default: llvm_unreachable("Don't know how to custom expand this"); case ISD::BITCAST: ReplaceBITCASTResults(N, Results, DAG); return; case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_SMAX: case ISD::VECREDUCE_SMIN: case ISD::VECREDUCE_UMAX: case ISD::VECREDUCE_UMIN: Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); return; case ISD::ADD: case ISD::FADD: ReplaceAddWithADDP(N, Results, DAG, Subtarget); return; case ISD::CTPOP: case ISD::PARITY: if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG)) Results.push_back(Result); return; case AArch64ISD::SADDV: ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); return; case AArch64ISD::UADDV: ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV); return; case AArch64ISD::SMINV: ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV); return; case AArch64ISD::UMINV: ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV); return; case AArch64ISD::SMAXV: ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV); return; case AArch64ISD::UMAXV: ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV); return; case ISD::MULHS: if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType())) Results.push_back( LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED)); return; case ISD::MULHU: if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType())) Results.push_back( LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED)); return; case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); // Let normal code take care of it by not adding anything to Results. 
return; case ISD::ATOMIC_CMP_SWAP: ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget); return; case ISD::ATOMIC_LOAD_CLR: assert(N->getValueType(0) != MVT::i128 && "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP"); break; case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_SWAP: { assert(cast(N)->getVal().getValueType() == MVT::i128 && "Expected 128-bit atomicrmw."); // These need custom type legalisation so we go directly to instruction. ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget); return; } case ISD::ATOMIC_LOAD: case ISD::LOAD: { MemSDNode *LoadNode = cast(N); EVT MemVT = LoadNode->getMemoryVT(); // Handle lowering 256 bit non temporal loads into LDNP for little-endian // targets. if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() && MemVT.getSizeInBits() == 256u && (MemVT.getScalarSizeInBits() == 8u || MemVT.getScalarSizeInBits() == 16u || MemVT.getScalarSizeInBits() == 32u || MemVT.getScalarSizeInBits() == 64u)) { SDValue Result = DAG.getMemIntrinsicNode( AArch64ISD::LDNP, SDLoc(N), DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), MVT::Other}), {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(), LoadNode->getMemOperand()); SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT, Result.getValue(0), Result.getValue(1)); Results.append({Pair, Result.getValue(2) /* Chain */}); return; } if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) || LoadNode->getMemoryVT() != MVT::i128) { // Non-volatile or atomic loads are optimized later in AArch64's load/store // optimizer. return; } if (SDValue(N, 0).getValueType() == MVT::i128) { auto *AN = dyn_cast(LoadNode); bool isLoadAcquire = AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire; unsigned Opcode = isLoadAcquire ? 
AArch64ISD::LDIAPP : AArch64ISD::LDP; if (isLoadAcquire) assert(Subtarget->hasFeature(AArch64::FeatureRCPC3)); SDValue Result = DAG.getMemIntrinsicNode( Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}), {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(), LoadNode->getMemOperand()); SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Result.getValue(0), Result.getValue(1)); Results.append({Pair, Result.getValue(2) /* Chain */}); } return; } case ISD::EXTRACT_SUBVECTOR: ReplaceExtractSubVectorResults(N, Results, DAG); return; case ISD::INSERT_SUBVECTOR: case ISD::CONCAT_VECTORS: // Custom lowering has been requested for INSERT_SUBVECTOR and // CONCAT_VECTORS -- but delegate to common code for result type // legalisation return; case ISD::INTRINSIC_WO_CHAIN: { EVT VT = N->getValueType(0); assert((VT == MVT::i8 || VT == MVT::i16) && "custom lowering for unexpected type"); ConstantSDNode *CN = cast(N->getOperand(0)); Intrinsic::ID IntID = static_cast(CN->getZExtValue()); switch (IntID) { default: return; case Intrinsic::aarch64_sve_clasta_n: { SDLoc DL(N); auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2)); auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32, N->getOperand(1), Op2, N->getOperand(3)); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); return; } case Intrinsic::aarch64_sve_clastb_n: { SDLoc DL(N); auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2)); auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32, N->getOperand(1), Op2, N->getOperand(3)); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); return; } case Intrinsic::aarch64_sve_lasta: { SDLoc DL(N); auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32, N->getOperand(1), N->getOperand(2)); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); return; } case Intrinsic::aarch64_sve_lastb: { SDLoc DL(N); auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32, N->getOperand(1), N->getOperand(2)); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); return; } } } case ISD::READ_REGISTER: { SDLoc DL(N); assert(N->getValueType(0) == MVT::i128 && "READ_REGISTER custom lowering is only for 128-bit sysregs"); SDValue Chain = N->getOperand(0); SDValue SysRegName = N->getOperand(1); SDValue Result = DAG.getNode( AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}), Chain, SysRegName); // Sysregs are not endian. Result.getValue(0) always contains the lower half // of the 128-bit System Register value. SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Result.getValue(0), Result.getValue(1)); Results.push_back(Pair); Results.push_back(Result.getValue(2)); // Chain return; } } } bool AArch64TargetLowering::useLoadStackGuardNode() const { if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia()) return TargetLowering::useLoadStackGuardNode(); return true; } unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal if there are three or more FDIVs. return 3; } TargetLoweringBase::LegalizeTypeAction AArch64TargetLowering::getPreferredVectorAction(MVT VT) const { // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8, // v4i16, v2i32 instead of to promote. 
if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 || VT == MVT::v1f32) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } // In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic // provided the address is 16-byte aligned. bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const { if (!Subtarget->hasLSE2()) return false; if (auto LI = dyn_cast(I)) return LI->getType()->getPrimitiveSizeInBits() == 128 && LI->getAlign() >= Align(16); if (auto SI = dyn_cast(I)) return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 && SI->getAlign() >= Align(16); return false; } bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const { if (!Subtarget->hasLSE128()) return false; // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP // will clobber the two registers. if (const auto *SI = dyn_cast(I)) return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 && SI->getAlign() >= Align(16) && (SI->getOrdering() == AtomicOrdering::Release || SI->getOrdering() == AtomicOrdering::SequentiallyConsistent); if (const auto *RMW = dyn_cast(I)) return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 && RMW->getAlign() >= Align(16) && (RMW->getOperation() == AtomicRMWInst::Xchg || RMW->getOperation() == AtomicRMWInst::And || RMW->getOperation() == AtomicRMWInst::Or); return false; } bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const { if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3()) return false; if (auto LI = dyn_cast(I)) return LI->getType()->getPrimitiveSizeInBits() == 128 && LI->getAlign() >= Align(16) && LI->getOrdering() == AtomicOrdering::Acquire; if (auto SI = dyn_cast(I)) return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 && SI->getAlign() >= Align(16) && SI->getOrdering() == AtomicOrdering::Release; return false; } bool AArch64TargetLowering::shouldInsertFencesForAtomic( const Instruction *I) const { if (isOpSuitableForRCPC3(I)) return false; if (isOpSuitableForLSE128(I)) return false; if (isOpSuitableForLDPSTP(I)) return true; return false; } bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore( const Instruction *I) const { // Store-Release instructions only provide seq_cst guarantees when paired with // Load-Acquire instructions. MSVC CRT does not use these instructions to // implement seq_cst loads and stores, so we need additional explicit fences // after memory writes. if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) return false; switch (I->getOpcode()) { default: return false; case Instruction::AtomicCmpXchg: return cast(I)->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent; case Instruction::AtomicRMW: return cast(I)->getOrdering() == AtomicOrdering::SequentiallyConsistent; case Instruction::Store: return cast(I)->getOrdering() == AtomicOrdering::SequentiallyConsistent; } } // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. 
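// Sketch of the 128-bit store decision below (conditions as checked in the
// function body): stores that qualify for RCPC3 or LSE2 LDP/STP stay as a
// single instruction (AtomicExpansionKind::None); LSE128-suitable stores and
// everything else are expanded (AtomicExpansionKind::Expand), the former so
// that SWPP can be used.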
TargetLoweringBase::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); if (Size != 128) return AtomicExpansionKind::None; if (isOpSuitableForRCPC3(SI)) return AtomicExpansionKind::None; if (isOpSuitableForLSE128(SI)) return AtomicExpansionKind::Expand; if (isOpSuitableForLDPSTP(SI)) return AtomicExpansionKind::None; return AtomicExpansionKind::Expand; } // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); if (Size != 128) return AtomicExpansionKind::None; if (isOpSuitableForRCPC3(LI)) return AtomicExpansionKind::None; // No LSE128 loads if (isOpSuitableForLDPSTP(LI)) return AtomicExpansionKind::None; // At -O0, fast-regalloc cannot cope with the live vregs necessary to // implement atomicrmw without spilling. If the target address is also on the // stack and close enough to the spill slot, this can lead to a situation // where the monitor always gets cleared and the atomic operation can never // succeed. So at -O0 lower this operation to a CAS loop. if (getTargetMachine().getOptLevel() == CodeGenOpt::None) return AtomicExpansionKind::CmpXChg; // Using CAS for an atomic load has a better chance of succeeding under high // contention situations. So use it if available. return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::LLSC; } // For the real atomic operations, we have ldxr/stxr up to 128 bits, TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { if (AI->isFloatingPointOperation()) return AtomicExpansionKind::CmpXChg; unsigned Size = AI->getType()->getPrimitiveSizeInBits(); if (Size > 128) return AtomicExpansionKind::None; bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 && (AI->getOperation() == AtomicRMWInst::Xchg || AI->getOperation() == AtomicRMWInst::Or || AI->getOperation() == AtomicRMWInst::And); if (CanUseLSE128) return AtomicExpansionKind::None; // Nand is not supported in LSE. // Leave 128 bits to LLSC or CmpXChg. if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) { if (Subtarget->hasLSE()) return AtomicExpansionKind::None; if (Subtarget->outlineAtomics()) { // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far. // Don't outline them unless // (1) high level support approved: // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf // (2) low level libgcc and compiler-rt support implemented by: // min/max outline atomics helpers if (AI->getOperation() != AtomicRMWInst::Min && AI->getOperation() != AtomicRMWInst::Max && AI->getOperation() != AtomicRMWInst::UMin && AI->getOperation() != AtomicRMWInst::UMax) { return AtomicExpansionKind::None; } } } // At -O0, fast-regalloc cannot cope with the live vregs necessary to // implement atomicrmw without spilling. If the target address is also on the // stack and close enough to the spill slot, this can lead to a situation // where the monitor always gets cleared and the atomic operation can never // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if // we have a single CAS instruction that can replace the loop. 
if (getTargetMachine().getOptLevel() == CodeGenOpt::None || Subtarget->hasLSE()) return AtomicExpansionKind::CmpXChg; return AtomicExpansionKind::LLSC; } TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { // If subtarget has LSE, leave cmpxchg intact for codegen. if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) return AtomicExpansionKind::None; // At -O0, fast-regalloc cannot cope with the live vregs necessary to // implement cmpxchg without spilling. If the address being exchanged is also // on the stack and close enough to the spill slot, this can lead to a // situation where the monitor always gets cleared and the atomic operation // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. if (getTargetMachine().getOptLevel() == CodeGenOpt::None) return AtomicExpansionKind::None; // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand // it. unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits(); if (Size > 64) return AtomicExpansionKind::None; return AtomicExpansionKind::LLSC; } Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); bool IsAcquire = isAcquireOrStronger(Ord); // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd // intrinsic must return {i64, i64} and we have to recombine them into a // single i128 here. if (ValueTy->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; Function *Ldxr = Intrinsic::getDeclaration(M, Int); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); Lo = Builder.CreateZExt(Lo, ValueTy, "lo64"); Hi = Builder.CreateZExt(Hi, ValueTy, "hi64"); return Builder.CreateOr( Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64"); } Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys); const DataLayout &DL = M->getDataLayout(); IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy)); CallInst *CI = Builder.CreateCall(Ldxr, Addr); CI->addParamAttr( 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy)); Value *Trunc = Builder.CreateTrunc(CI, IntEltTy); return Builder.CreateBitCast(Trunc, ValueTy); } void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilderBase &Builder) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); } Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); bool IsRelease = isReleaseOrStronger(Ord); // Since the intrinsics must have legal type, the i128 intrinsics take two // parameters: "i64, i64". We must marshal Val into the appropriate form // before the call. if (Val->getType()->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsRelease ? 
Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; Function *Stxr = Intrinsic::getDeclaration(M, Int); Type *Int64Ty = Type::getInt64Ty(M->getContext()); Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); } Intrinsic::ID Int = IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; Type *Tys[] = { Addr->getType() }; Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); const DataLayout &DL = M->getDataLayout(); IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType())); Val = Builder.CreateBitCast(Val, IntValTy); CallInst *CI = Builder.CreateCall( Stxr, {Builder.CreateZExtOrBitCast( Val, Stxr->getFunctionType()->getParamType(0)), Addr}); CI->addParamAttr(1, Attribute::get(Builder.getContext(), Attribute::ElementType, Val->getType())); return CI; } bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const { if (!Ty->isArrayTy()) { const TypeSize &TySize = Ty->getPrimitiveSizeInBits(); return TySize.isScalable() && TySize.getKnownMinValue() > 128; } // All non aggregate members of the type must have the same type SmallVector ValueVTs; ComputeValueVTs(*this, DL, Ty, ValueVTs); return all_equal(ValueVTs); } bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, EVT) const { return false; } static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Function *ThreadPointerFunc = Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); return IRB.CreatePointerCast( IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), Offset), IRB.getInt8PtrTy()->getPointerTo(0)); } Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const { // Android provides a fixed TLS slot for the stack cookie. See the definition // of TLS_SLOT_STACK_GUARD in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h if (Subtarget->isTargetAndroid()) return UseTlsOffset(IRB, 0x28); // Fuchsia is similar. // defines ZX_TLS_STACK_GUARD_OFFSET with this value. if (Subtarget->isTargetFuchsia()) return UseTlsOffset(IRB, -0x10); return TargetLowering::getIRStackGuard(IRB); } void AArch64TargetLowering::insertSSPDeclarations(Module &M) const { // MSVC CRT provides functionalities for stack protection. if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) { // MSVC CRT has a global variable holding security cookie. M.getOrInsertGlobal("__security_cookie", Type::getInt8PtrTy(M.getContext())); // MSVC CRT has a function to validate security cookie. FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( Subtarget->getSecurityCheckCookieName(), Type::getVoidTy(M.getContext()), Type::getInt8PtrTy(M.getContext())); if (Function *F = dyn_cast(SecurityCheckCookie.getCallee())) { F->setCallingConv(CallingConv::Win64); F->addParamAttr(0, Attribute::AttrKind::InReg); } return; } TargetLowering::insertSSPDeclarations(M); } Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const { // MSVC CRT has a global variable holding security cookie. 
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getFunction(Subtarget->getSecurityCheckCookieName());
  return TargetLowering::getSSPStackGuardCheck(M);
}

Value *
AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget->isTargetAndroid())
    return UseTlsOffset(IRB, 0x48);

  // Fuchsia is similar.
  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
  if (Subtarget->isTargetFuchsia())
    return UseTlsOffset(IRB, -0x8);

  return TargetLowering::getSafeStackPointerLocation(IRB);
}

bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  // Only sink 'and' mask to cmp use block if it is masking a single bit, since
  // this is likely to fold the and/cmp/br into a single tbz instruction. It
  // may be beneficial to sink in other cases, but we would have to check that
  // the cmp would not get folded into the br to form a cbz for these to be
  // beneficial.
  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
  if (!Mask)
    return false;
  return Mask->getValue().isPowerOf2();
}

bool AArch64TargetLowering::
    shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
        SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
        SelectionDAG &DAG) const {
  // Does baseline recommend not to perform the fold by default?
  if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
          X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
    return false;
  // Else, if this is a vector shift, prefer 'shl'.
  return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
}

TargetLowering::ShiftLegalizationStrategy
AArch64TargetLowering::preferredShiftLegalizationStrategy(
    SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
      !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
    return ShiftLegalizationStrategy::LowerToLibcall;
  return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
                                                            ExpansionFactor);
}

void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in AArch64FunctionInfo.
AArch64FunctionInfo *AFI = Entry->getParent()->getInfo(); AFI->setIsSplitCSR(true); } void AArch64TargetLowering::insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const { const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); if (!IStart) return; const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (AArch64::GPR64RegClass.contains(*I)) RC = &AArch64::GPR64RegClass; else if (AArch64::FPR64RegClass.contains(*I)) RC = &AArch64::FPR64RegClass; else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. assert(Entry->getParent()->getFunction().hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // Integer division on AArch64 is expensive. However, when aggressively // optimizing for code size, we prefer to use a div instruction, as it is // usually smaller than the alternative sequence. // The exception to this is vector division. Since AArch64 doesn't have vector // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. bool OptSize = Attr.hasFnAttr(Attribute::MinSize); return OptSize && !VT.isVector(); } bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { // We want inc-of-add for scalars and sub-of-not for vectors. return VT.isScalarInteger(); } bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const { // v8f16 without fp16 need to be extended to v8f32, which is more difficult to // legalize. 
if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16()) return false; return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT); } MachineInstr * AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const { assert(MBBI->isCall() && MBBI->getCFIType() && "Invalid call instruction for a KCFI check"); switch (MBBI->getOpcode()) { case AArch64::BLR: case AArch64::BLRNoIP: case AArch64::TCRETURNri: case AArch64::TCRETURNriBTI: break; default: llvm_unreachable("Unexpected CFI call opcode"); } MachineOperand &Target = MBBI->getOperand(0); assert(Target.isReg() && "Invalid target operand for an indirect call"); Target.setIsRenamable(false); return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK)) .addReg(Target.getReg()) .addImm(MBBI->getCFIType()) .getInstr(); } bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const { return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint(); } unsigned AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) return getPointerTy(DL).getSizeInBits(); return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32; } void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const { MachineFrameInfo &MFI = MF.getFrameInfo(); // If we have any vulnerable SVE stack objects then the stack protector // needs to be placed at the top of the SVE stack area, as the SVE locals // are placed above the other locals, so we allocate it as if it were a // scalable vector. // FIXME: It may be worthwhile having a specific interface for this rather // than doing it here in finalizeLowering. if (MFI.hasStackProtectorIndex()) { for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) { if (MFI.getStackID(i) == TargetStackID::ScalableVector && MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) { MFI.setStackID(MFI.getStackProtectorIndex(), TargetStackID::ScalableVector); MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16)); break; } } } MFI.computeMaxCallFrameSize(MF); TargetLoweringBase::finalizeLowering(MF); } // Unlike X86, we let frame lowering assign offsets to all catch objects. bool AArch64TargetLowering::needsFixedCatchObjects() const { return false; } bool AArch64TargetLowering::shouldLocalize( const MachineInstr &MI, const TargetTransformInfo *TTI) const { auto &MF = *MI.getMF(); auto &MRI = MF.getRegInfo(); auto maxUses = [](unsigned RematCost) { // A cost of 1 means remats are basically free. if (RematCost == 1) return std::numeric_limits::max(); if (RematCost == 2) return 2U; // Remat is too expensive, only sink if there's one user. if (RematCost > 2) return 1U; llvm_unreachable("Unexpected remat cost"); }; switch (MI.getOpcode()) { case TargetOpcode::G_GLOBAL_VALUE: { // On Darwin, TLS global vars get selected into function calls, which // we don't want localized, as they can get moved into the middle of a // another call sequence. const GlobalValue &GV = *MI.getOperand(1).getGlobal(); if (GV.isThreadLocal() && Subtarget->isTargetMachO()) return false; return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure. 
} case TargetOpcode::G_CONSTANT: { auto *CI = MI.getOperand(1).getCImm(); APInt Imm = CI->getValue(); InstructionCost Cost = TTI->getIntImmCost( Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize); assert(Cost.isValid() && "Expected a valid imm cost"); unsigned RematCost = *Cost.getValue(); Register Reg = MI.getOperand(0).getReg(); unsigned MaxUses = maxUses(RematCost); // Don't pass UINT_MAX sentinal value to hasAtMostUserInstrs(). if (MaxUses == std::numeric_limits::max()) --MaxUses; return MRI.hasAtMostUserInstrs(Reg, MaxUses); } // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being // localizable. case AArch64::ADRP: case AArch64::G_ADD_LOW: // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too. case TargetOpcode::G_PTR_ADD: return true; default: break; } return TargetLoweringBase::shouldLocalize(MI, TTI); } bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { if (Inst.getType()->isScalableTy()) return true; for (unsigned i = 0; i < Inst.getNumOperands(); ++i) if (Inst.getOperand(i)->getType()->isScalableTy()) return true; if (const AllocaInst *AI = dyn_cast(&Inst)) { if (AI->getAllocatedType()->isScalableTy()) return true; } // Checks to allow the use of SME instructions if (auto *Base = dyn_cast(&Inst)) { auto CallerAttrs = SMEAttrs(*Inst.getFunction()); auto CalleeAttrs = SMEAttrs(*Base); if (CallerAttrs.requiresSMChange(CalleeAttrs, /*BodyOverridesInterface=*/false) || CallerAttrs.requiresLazySave(CalleeAttrs)) return true; } return false; } // Return the largest legal scalable vector type that matches VT's element type. static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) { assert(VT.isFixedLengthVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && "Expected legal fixed length vector!"); switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { default: llvm_unreachable("unexpected element type for SVE container"); case MVT::i8: return EVT(MVT::nxv16i8); case MVT::i16: return EVT(MVT::nxv8i16); case MVT::i32: return EVT(MVT::nxv4i32); case MVT::i64: return EVT(MVT::nxv2i64); case MVT::f16: return EVT(MVT::nxv8f16); case MVT::f32: return EVT(MVT::nxv4f32); case MVT::f64: return EVT(MVT::nxv2f64); } } // Return a PTRUE with active lanes corresponding to the extent of VT. static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) { assert(VT.isFixedLengthVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && "Expected legal fixed length vector!"); std::optional PgPattern = getSVEPredPatternFromNumElements(VT.getVectorNumElements()); assert(PgPattern && "Unexpected element count for SVE predicate"); // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use // AArch64SVEPredPattern::all, which can enable the use of unpredicated // variants of instructions when available. 
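// For example (illustrative): with -msve-vector-bits=256 a fixed v8i32
// vector fills the whole register, so "ptrue ..., all" can be used; if only
// a 256-bit minimum is known, the predicate must remain "ptrue ..., vl8".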
const auto &Subtarget = DAG.getSubtarget(); unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits(); unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits(); if (MaxSVESize && MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) PgPattern = AArch64SVEPredPattern::all; MVT MaskVT; switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { default: llvm_unreachable("unexpected element type for SVE predicate"); case MVT::i8: MaskVT = MVT::nxv16i1; break; case MVT::i16: case MVT::f16: MaskVT = MVT::nxv8i1; break; case MVT::i32: case MVT::f32: MaskVT = MVT::nxv4i1; break; case MVT::i64: case MVT::f64: MaskVT = MVT::nxv2i1; break; } return getPTrue(DAG, DL, MaskVT, *PgPattern); } static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) { assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && "Expected legal scalable vector!"); auto PredTy = VT.changeVectorElementType(MVT::i1); return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all); } static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) { if (VT.isFixedLengthVector()) return getPredicateForFixedLengthVector(DAG, DL, VT); return getPredicateForScalableVector(DAG, DL, VT); } // Grow V to consume an entire SVE register. static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) { assert(VT.isScalableVector() && "Expected to convert into a scalable vector!"); assert(V.getValueType().isFixedLengthVector() && "Expected a fixed length vector operand!"); SDLoc DL(V); SDValue Zero = DAG.getConstant(0, DL, MVT::i64); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero); } // Shrink V so it's just big enough to maintain a VT's worth of data. static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) { assert(VT.isFixedLengthVector() && "Expected to convert into a fixed length vector!"); assert(V.getValueType().isScalableVector() && "Expected a scalable vector operand!"); SDLoc DL(V); SDValue Zero = DAG.getConstant(0, DL, MVT::i64); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero); } // Convert all fixed length vector loads larger than NEON to masked_loads. 
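// For example (a sketch, assuming 256-bit SVE registers): a load of
//   <8 x i32> from %p
// becomes a predicated (masked) load of the nxv4i32 container governed by
// the VL8 predicate built above, and the result is narrowed back to
// <8 x i32> via EXTRACT_SUBVECTOR at index 0.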
SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE( SDValue Op, SelectionDAG &DAG) const { auto Load = cast(Op); SDLoc DL(Op); EVT VT = Op.getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); EVT LoadVT = ContainerVT; EVT MemVT = Load->getMemoryVT(); auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT); if (VT.isFloatingPoint()) { LoadVT = ContainerVT.changeTypeToInteger(); MemVT = MemVT.changeTypeToInteger(); } SDValue NewLoad = DAG.getMaskedLoad( LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg, DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(), Load->getAddressingMode(), Load->getExtensionType()); SDValue Result = NewLoad; if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) { EVT ExtendVT = ContainerVT.changeVectorElementType( Load->getMemoryVT().getVectorElementType()); Result = getSVESafeBitCast(ExtendVT, Result, DAG); Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT, Pg, Result, DAG.getUNDEF(ContainerVT)); } else if (VT.isFloatingPoint()) { Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result); } Result = convertFromScalableVector(DAG, VT, Result); SDValue MergedValues[2] = {Result, NewLoad.getValue(1)}; return DAG.getMergeValues(MergedValues, DL); } static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG) { SDLoc DL(Mask); EVT InVT = Mask.getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT); if (ISD::isBuildVectorAllOnes(Mask.getNode())) return Pg; auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask); auto Op2 = DAG.getConstant(0, DL, ContainerVT); return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(), {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)}); } // Convert all fixed length vector loads larger than NEON to masked_loads. SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE( SDValue Op, SelectionDAG &DAG) const { auto Load = cast(Op); SDLoc DL(Op); EVT VT = Op.getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); SDValue Mask = convertFixedMaskToScalableVector(Load->getMask(), DAG); SDValue PassThru; bool IsPassThruZeroOrUndef = false; if (Load->getPassThru()->isUndef()) { PassThru = DAG.getUNDEF(ContainerVT); IsPassThruZeroOrUndef = true; } else { if (ContainerVT.isInteger()) PassThru = DAG.getConstant(0, DL, ContainerVT); else PassThru = DAG.getConstantFP(0, DL, ContainerVT); if (isZerosVector(Load->getPassThru().getNode())) IsPassThruZeroOrUndef = true; } SDValue NewLoad = DAG.getMaskedLoad( ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(), Load->getExtensionType()); SDValue Result = NewLoad; if (!IsPassThruZeroOrUndef) { SDValue OldPassThru = convertToScalableVector(DAG, ContainerVT, Load->getPassThru()); Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru); } Result = convertFromScalableVector(DAG, VT, Result); SDValue MergedValues[2] = {Result, NewLoad.getValue(1)}; return DAG.getMergeValues(MergedValues, DL); } // Convert all fixed length vector stores larger than NEON to masked_stores. 
SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE( SDValue Op, SelectionDAG &DAG) const { auto Store = cast(Op); SDLoc DL(Op); EVT VT = Store->getValue().getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); EVT MemVT = Store->getMemoryVT(); auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT); auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue()); if (VT.isFloatingPoint() && Store->isTruncatingStore()) { EVT TruncVT = ContainerVT.changeVectorElementType( Store->getMemoryVT().getVectorElementType()); MemVT = MemVT.changeTypeToInteger(); NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg, NewValue, DAG.getTargetConstant(0, DL, MVT::i64), DAG.getUNDEF(TruncVT)); NewValue = getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG); } else if (VT.isFloatingPoint()) { MemVT = MemVT.changeTypeToInteger(); NewValue = getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG); } return DAG.getMaskedStore(Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(), Pg, MemVT, Store->getMemOperand(), Store->getAddressingMode(), Store->isTruncatingStore()); } SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE( SDValue Op, SelectionDAG &DAG) const { auto *Store = cast(Op); SDLoc DL(Op); EVT VT = Store->getValue().getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue()); SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG); return DAG.getMaskedStore( Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(), Mask, Store->getMemoryVT(), Store->getMemOperand(), Store->getAddressingMode(), Store->isTruncatingStore()); } SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE( SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); EVT VT = Op.getValueType(); EVT EltVT = VT.getVectorElementType(); bool Signed = Op.getOpcode() == ISD::SDIV; unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; bool Negated; uint64_t SplatVal; if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) { EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32); SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT); SDValue Res = DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2); if (Negated) Res = DAG.getNode(ISD::SUB, dl, ContainerVT, DAG.getConstant(0, dl, ContainerVT), Res); return convertFromScalableVector(DAG, VT, Res); } // Scalable vector i32/i64 DIV is supported. if (EltVT == MVT::i32 || EltVT == MVT::i64) return LowerToPredicatedOp(Op, DAG, PredOpcode); // Scalable vector i8/i16 DIV is not supported. Promote it to i32. EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext()); unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; // If the wider type is legal: extend, op, and truncate. 
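// For example (illustrative): for a v8i16 sdiv the operands are sign
// extended to v8i32, divided there (where SVE's predicated SDIV exists) and
// the result truncated back; the split path further below handles the case
// where the widened type is not legal.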
EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext()); if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) { SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0)); SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1)); SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1); return DAG.getNode(ISD::TRUNCATE, dl, VT, Div); } auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT, &ExtendOpcode](SDValue Op) { SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64); SDValue IdxHalf = DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64); SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero); SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf); return std::pair( {DAG.getNode(ExtendOpcode, dl, PromVT, Lo), DAG.getNode(ExtendOpcode, dl, PromVT, Hi)}); }; // If wider type is not legal: split, extend, op, trunc and concat. auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0)); auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1)); SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt); SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt); SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo); SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc}); } SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE( SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); SDLoc DL(Op); SDValue Val = Op.getOperand(0); EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType()); Val = convertToScalableVector(DAG, ContainerVT, Val); bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND; unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; // Repeatedly unpack Val until the result is of the desired element type. switch (ContainerVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("unimplemented container type"); case MVT::nxv16i8: Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val); if (VT.getVectorElementType() == MVT::i16) break; [[fallthrough]]; case MVT::nxv8i16: Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val); if (VT.getVectorElementType() == MVT::i32) break; [[fallthrough]]; case MVT::nxv4i32: Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val); assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!"); break; } return convertFromScalableVector(DAG, VT, Val); } SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE( SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); SDLoc DL(Op); SDValue Val = Op.getOperand(0); EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType()); Val = convertToScalableVector(DAG, ContainerVT, Val); // Repeatedly truncate Val until the result is of the desired element type. 
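// For example (illustrative): truncating 64-bit elements down to i8 applies
// UZP1 at .s, then .h, then .b, keeping the even-numbered (low-half)
// elements at each step.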
switch (ContainerVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("unimplemented container type"); case MVT::nxv2i64: Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val); Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val); if (VT.getVectorElementType() == MVT::i32) break; [[fallthrough]]; case MVT::nxv4i32: Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val); Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val); if (VT.getVectorElementType() == MVT::i16) break; [[fallthrough]]; case MVT::nxv8i16: Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val); Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val); assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!"); break; } return convertFromScalableVector(DAG, VT, Val); } SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt( SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); EVT InVT = Op.getOperand(0).getValueType(); assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!"); SDLoc DL(Op); EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1)); } SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt( SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); SDLoc DL(Op); EVT InVT = Op.getOperand(0).getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0)); auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0, Op.getOperand(1), Op.getOperand(2)); return convertFromScalableVector(DAG, VT, ScalableRes); } // Convert vector operation 'Op' to an equivalent predicated operation whereby // the original operation's type is used to construct a suitable predicate. // NOTE: The results for inactive lanes are undefined. SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const { EVT VT = Op.getValueType(); SDLoc DL(Op); auto Pg = getPredicateForVector(DAG, DL, VT); if (VT.isFixedLengthVector()) { assert(isTypeLegal(VT) && "Expected only legal fixed-width types"); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); // Create list of operands by converting existing ones to scalable types. 
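// For example (illustrative, assuming 256-bit SVE registers): a fixed-length
// v8f32 FADD arrives here with NewOp == AArch64ISD::FADD_PRED; both operands
// are widened to nxv4f32, prefixed with the governing predicate built above,
// and the result is narrowed back to v8f32.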
SmallVector Operands = {Pg}; for (const SDValue &V : Op->op_values()) { if (isa(V)) { Operands.push_back(V); continue; } if (const VTSDNode *VTNode = dyn_cast(V)) { EVT VTArg = VTNode->getVT().getVectorElementType(); EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg); Operands.push_back(DAG.getValueType(NewVTArg)); continue; } assert(isTypeLegal(V.getValueType()) && "Expected only legal fixed-width types"); Operands.push_back(convertToScalableVector(DAG, ContainerVT, V)); } if (isMergePassthruOpcode(NewOp)) Operands.push_back(DAG.getUNDEF(ContainerVT)); auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands); return convertFromScalableVector(DAG, VT, ScalableRes); } assert(VT.isScalableVector() && "Only expect to lower scalable vector op!"); SmallVector Operands = {Pg}; for (const SDValue &V : Op->op_values()) { assert((!V.getValueType().isVector() || V.getValueType().isScalableVector()) && "Only scalable vectors are supported!"); Operands.push_back(V); } if (isMergePassthruOpcode(NewOp)) Operands.push_back(DAG.getUNDEF(VT)); return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags()); } // If a fixed length vector operation has no side effects when applied to // undefined elements, we can safely use scalable vectors to perform the same // operation without needing to worry about predication. SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); assert(VT.isFixedLengthVector() && isTypeLegal(VT) && "Only expected to lower fixed length vector operation!"); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); // Create list of operands by converting existing ones to scalable types. SmallVector Ops; for (const SDValue &V : Op->op_values()) { assert(!isa(V) && "Unexpected VTSDNode node!"); // Pass through non-vector operands. if (!V.getValueType().isVector()) { Ops.push_back(V); continue; } // "cast" fixed length vector to a scalable vector. assert(V.getValueType().isFixedLengthVector() && isTypeLegal(V.getValueType()) && "Only fixed length vectors are supported!"); Ops.push_back(convertToScalableVector(DAG, ContainerVT, V)); } auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops); return convertFromScalableVector(DAG, VT, ScalableRes); } SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp, SelectionDAG &DAG) const { SDLoc DL(ScalarOp); SDValue AccOp = ScalarOp.getOperand(0); SDValue VecOp = ScalarOp.getOperand(1); EVT SrcVT = VecOp.getValueType(); EVT ResVT = SrcVT.getVectorElementType(); EVT ContainerVT = SrcVT; if (SrcVT.isFixedLengthVector()) { ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT); VecOp = convertToScalableVector(DAG, ContainerVT, VecOp); } SDValue Pg = getPredicateForVector(DAG, DL, SrcVT); SDValue Zero = DAG.getConstant(0, DL, MVT::i64); // Convert operands to Scalable. AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, DAG.getUNDEF(ContainerVT), AccOp, Zero); // Perform reduction. 
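// For example (illustrative): vecreduce_seq_fadd(%acc, <4 x float> %v)
// becomes FADDA_PRED(pg, insert %acc into lane 0, %v), a strictly ordered
// accumulation, and lane 0 of the result is extracted as the scalar answer.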
SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT, Pg, AccOp, VecOp); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero); } SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp, SelectionDAG &DAG) const { SDLoc DL(ReduceOp); SDValue Op = ReduceOp.getOperand(0); EVT OpVT = Op.getValueType(); EVT VT = ReduceOp.getValueType(); if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1) return SDValue(); SDValue Pg = getPredicateForVector(DAG, DL, OpVT); switch (ReduceOp.getOpcode()) { default: return SDValue(); case ISD::VECREDUCE_OR: if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1) // The predicate can be 'Op' because // vecreduce_or(Op & ) <=> vecreduce_or(Op). return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE); else return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE); case ISD::VECREDUCE_AND: { Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg); return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE); } case ISD::VECREDUCE_XOR: { SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64); if (OpVT == MVT::nxv1i1) { // Emulate a CNTP on .Q using .D and a different governing predicate. Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg); Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op); } SDValue Cntp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op); return DAG.getAnyExtOrTrunc(Cntp, DL, VT); } } return SDValue(); } SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode, SDValue ScalarOp, SelectionDAG &DAG) const { SDLoc DL(ScalarOp); SDValue VecOp = ScalarOp.getOperand(0); EVT SrcVT = VecOp.getValueType(); if (useSVEForFixedLengthVectorVT( SrcVT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) { EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT); VecOp = convertToScalableVector(DAG, ContainerVT, VecOp); } // UADDV always returns an i64 result. EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 : SrcVT.getVectorElementType(); EVT RdxVT = SrcVT; if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED) RdxVT = getPackedSVEVectorVT(ResVT); SDValue Pg = getPredicateForVector(DAG, DL, SrcVT); SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp); SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, DAG.getConstant(0, DL, MVT::i64)); // The VEC_REDUCE nodes expect an element size result. if (ResVT != ScalarOp.getValueType()) Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType()); return Res; } SDValue AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc DL(Op); EVT InVT = Op.getOperand(1).getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1)); SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2)); // Convert the mask to a predicated (NOTE: We don't need to worry about // inactive lanes since VSELECT is safe when given undefined elements). 
EVT MaskVT = Op.getOperand(0).getValueType(); EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT); auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0)); Mask = DAG.getNode(ISD::TRUNCATE, DL, MaskContainerVT.changeVectorElementType(MVT::i1), Mask); auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT, Mask, Op1, Op2); return convertFromScalableVector(DAG, VT, ScalableRes); } SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE( SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT InVT = Op.getOperand(0).getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) && "Only expected to lower fixed length vector operation!"); assert(Op.getValueType() == InVT.changeTypeToInteger() && "Expected integer result of the same bit length as the inputs!"); auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1)); auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT); EVT CmpVT = Pg.getValueType(); auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT, {Pg, Op1, Op2, Op.getOperand(2)}); EVT PromoteVT = ContainerVT.changeTypeToInteger(); auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT); return convertFromScalableVector(DAG, Op.getValueType(), Promote); } SDValue AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); auto SrcOp = Op.getOperand(0); EVT VT = Op.getValueType(); EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT); EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcOp.getValueType()); SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp); Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp); return convertFromScalableVector(DAG, VT, Op); } SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE( SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); unsigned NumOperands = Op->getNumOperands(); assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); auto SrcOp1 = Op.getOperand(0); auto SrcOp2 = Op.getOperand(1); EVT VT = Op.getValueType(); EVT SrcVT = SrcOp1.getValueType(); if (NumOperands > 2) { SmallVector Ops; EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext()); for (unsigned I = 0; I < NumOperands; I += 2) Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT, Op->getOperand(I), Op->getOperand(I + 1))); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops); } EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT); SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1); SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2); Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2); return convertFromScalableVector(DAG, VT, Op); } SDValue AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); SDLoc DL(Op); SDValue Val = Op.getOperand(0); SDValue Pg = getPredicateForVector(DAG, DL, VT); EVT SrcVT = Val.getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); EVT ExtendVT = ContainerVT.changeVectorElementType( SrcVT.getVectorElementType()); Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val); Val = DAG.getNode(ISD::ANY_EXTEND, 
DL, VT.changeTypeToInteger(), Val); Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val); Val = getSVESafeBitCast(ExtendVT, Val, DAG); Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT, Pg, Val, DAG.getUNDEF(ContainerVT)); return convertFromScalableVector(DAG, VT, Val); } SDValue AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); SDLoc DL(Op); SDValue Val = Op.getOperand(0); EVT SrcVT = Val.getValueType(); EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT); EVT RoundVT = ContainerSrcVT.changeVectorElementType( VT.getVectorElementType()); SDValue Pg = getPredicateForVector(DAG, DL, RoundVT); Val = convertToScalableVector(DAG, ContainerSrcVT, Val); Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val, Op.getOperand(1), DAG.getUNDEF(RoundVT)); Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG); Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val); Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val); return DAG.getNode(ISD::BITCAST, DL, VT, Val); } SDValue AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP; unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU; SDLoc DL(Op); SDValue Val = Op.getOperand(0); EVT SrcVT = Val.getValueType(); EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT); EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT); if (VT.bitsGE(SrcVT)) { SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT); Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT.changeTypeToInteger(), Val); // Safe to use a larger than specified operand because by promoting the // value nothing has changed from an arithmetic point of view. 
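// For example (illustrative): sitofp v8i16 -> v8f32 sign extends the
// integers to i32 first; the extra bits do not change the value, and the
// conversion is then a single predicated SCVTF on the f32 container.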
Val = convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val); Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val, DAG.getUNDEF(ContainerDstVT)); return convertFromScalableVector(DAG, VT, Val); } else { EVT CvtVT = ContainerSrcVT.changeVectorElementType( ContainerDstVT.getVectorElementType()); SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT); Val = convertToScalableVector(DAG, ContainerSrcVT, Val); Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT)); Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG); Val = convertFromScalableVector(DAG, SrcVT, Val); Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val); return DAG.getNode(ISD::BITCAST, DL, VT, Val); } } SDValue AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT OpVT = Op.getValueType(); assert(OpVT.isScalableVector() && "Expected scalable vector in LowerVECTOR_DEINTERLEAVE."); SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0), Op.getOperand(1)); SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0), Op.getOperand(1)); return DAG.getMergeValues({Even, Odd}, DL); } SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT OpVT = Op.getValueType(); assert(OpVT.isScalableVector() && "Expected scalable vector in LowerVECTOR_INTERLEAVE."); SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0), Op.getOperand(1)); SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0), Op.getOperand(1)); return DAG.getMergeValues({Lo, Hi}, DL); } SDValue AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU : AArch64ISD::FCVTZU_MERGE_PASSTHRU; SDLoc DL(Op); SDValue Val = Op.getOperand(0); EVT SrcVT = Val.getValueType(); EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT); EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT); if (VT.bitsGT(SrcVT)) { EVT CvtVT = ContainerDstVT.changeVectorElementType( ContainerSrcVT.getVectorElementType()); SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT); Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val); Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val); Val = convertToScalableVector(DAG, ContainerDstVT, Val); Val = getSVESafeBitCast(CvtVT, Val, DAG); Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val, DAG.getUNDEF(ContainerDstVT)); return convertFromScalableVector(DAG, VT, Val); } else { EVT CvtVT = ContainerSrcVT.changeTypeToInteger(); SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT); // Safe to use a larger than specified result since an fp_to_int where the // result doesn't fit into the destination is undefined. 
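// For example (illustrative): fptosi v8f32 -> v8i16 is converted as
// f32 -> i32 (predicated FCVTZS) and then truncated; values that do not fit
// in i16 are poison anyway, so the wider intermediate result is harmless.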
Val = convertToScalableVector(DAG, ContainerSrcVT, Val); Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT)); Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val); return DAG.getNode(ISD::TRUNCATE, DL, VT, Val); } } SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); auto *SVN = cast(Op.getNode()); auto ShuffleMask = SVN->getMask(); SDLoc DL(Op); SDValue Op1 = Op.getOperand(0); SDValue Op2 = Op.getOperand(1); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); Op1 = convertToScalableVector(DAG, ContainerVT, Op1); Op2 = convertToScalableVector(DAG, ContainerVT, Op2); auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT { if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16) return MVT::i32; return ScalarTy; }; if (SVN->isSplat()) { unsigned Lane = std::max(0, SVN->getSplatIndex()); EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType()); SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1, DAG.getConstant(Lane, DL, MVT::i64)); Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl); return convertFromScalableVector(DAG, VT, Op); } bool ReverseEXT = false; unsigned Imm; if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) && Imm == VT.getVectorNumElements() - 1) { if (ReverseEXT) std::swap(Op1, Op2); EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType()); SDValue Scalar = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1, DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64)); Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar); return convertFromScalableVector(DAG, VT, Op); } for (unsigned LaneSize : {64U, 32U, 16U}) { if (isREVMask(ShuffleMask, VT, LaneSize)) { EVT NewVT = getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize)); unsigned RevOp; unsigned EltSz = VT.getScalarSizeInBits(); if (EltSz == 8) RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU; else if (EltSz == 16) RevOp = AArch64ISD::REVH_MERGE_PASSTHRU; else RevOp = AArch64ISD::REVW_MERGE_PASSTHRU; Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1); Op = LowerToPredicatedOp(Op, DAG, RevOp); Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op); return convertFromScalableVector(DAG, VT, Op); } } if (Subtarget->hasSVE2p1() && VT.getScalarSizeInBits() == 64 && isREVMask(ShuffleMask, VT, 128)) { if (!VT.isFloatingPoint()) return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU); EVT NewVT = getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), 64)); Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1); Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU); Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op); return convertFromScalableVector(DAG, VT, Op); } unsigned WhichResult; if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0) return convertFromScalableVector( DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2)); if (isTRNMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; return convertFromScalableVector( DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2)); } if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0) return convertFromScalableVector( DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1)); if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? 
AArch64ISD::TRN1 : AArch64ISD::TRN2; return convertFromScalableVector( DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1)); } // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask // represents the same logical operation as performed by a ZIP instruction. In // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly // equivalent to an AArch64 instruction. There's the extra component of // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions // only operated on 64/128bit vector types that have a direct mapping to a // target register and so an exact mapping is implied. // However, when using SVE for fixed length vectors, most legal vector types // are actually sub-vectors of a larger SVE register. When mapping // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider // how the mask's indices translate. Specifically, when the mapping requires // an exact meaning for a specific vector index (e.g. Index X is the last // vector element in the register) then such mappings are often only safe when // the exact SVE register size is know. The main exception to this is when // indices are logically relative to the first element of either // ISD::VECTOR_SHUFFLE operand because these relative indices don't change // when converting from fixed-length to scalable vector types (i.e. the start // of a fixed length vector is always the start of a scalable vector). unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits(); unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits(); if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) { if (ShuffleVectorInst::isReverseMask(ShuffleMask) && Op2.isUndef()) { Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1); return convertFromScalableVector(DAG, VT, Op); } if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0) return convertFromScalableVector( DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2)); if (isUZPMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; return convertFromScalableVector( DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2)); } if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0) return convertFromScalableVector( DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1)); if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; return convertFromScalableVector( DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1)); } } return SDValue(); } SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT InVT = Op.getValueType(); assert(VT.isScalableVector() && isTypeLegal(VT) && InVT.isScalableVector() && isTypeLegal(InVT) && "Only expect to cast between legal scalable vector types!"); assert(VT.getVectorElementType() != MVT::i1 && InVT.getVectorElementType() != MVT::i1 && "For predicate bitcasts, use getSVEPredicateBitCast"); if (InVT == VT) return Op; EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType()); EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType()); // Safe bitcasting between unpacked vector types of different element counts // is currently unsupported because the following is missing the necessary // work to ensure the result's elements live where they're supposed to within // an SVE register. // 01234567 // e.g. nxv2i32 = XX??XX?? // nxv4f16 = X?X?X?X? 
assert((VT.getVectorElementCount() == InVT.getVectorElementCount() || VT == PackedVT || InVT == PackedInVT) && "Unexpected bitcast!"); // Pack input if required. if (InVT != PackedInVT) Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op); Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op); // Unpack result if required. if (VT != PackedVT) Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op); return Op; } bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG, SDValue N) const { return ::isAllActivePredicate(DAG, N); } EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const { return ::getPromotedVTForPredicate(VT); } bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode( SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const { unsigned Opc = Op.getOpcode(); switch (Opc) { case AArch64ISD::VSHL: { // Match (VSHL (VLSHR Val X) X) SDValue ShiftL = Op; SDValue ShiftR = Op->getOperand(0); if (ShiftR->getOpcode() != AArch64ISD::VLSHR) return false; if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse()) return false; unsigned ShiftLBits = ShiftL->getConstantOperandVal(1); unsigned ShiftRBits = ShiftR->getConstantOperandVal(1); // Other cases can be handled as well, but this is not // implemented. if (ShiftRBits != ShiftLBits) return false; unsigned ScalarSize = Op.getScalarValueSizeInBits(); assert(ScalarSize > ShiftLBits && "Invalid shift imm"); APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits); APInt UnusedBits = ~OriginalDemandedBits; if ((ZeroBits & UnusedBits) != ZeroBits) return false; // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not // used - simplify to just Val. return TLO.CombineTo(Op, ShiftR->getOperand(0)); } case ISD::INTRINSIC_WO_CHAIN: { if (auto ElementSize = IsSVECntIntrinsic(Op)) { unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits(); if (!MaxSVEVectorSizeInBits) MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector; unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize; // The SVE count intrinsics don't support the multiplier immediate so we // don't have to account for that here. The value returned may be slightly // over the true required bits, as this is based on the "ALL" pattern. The // other patterns are also exposed by these intrinsics, but they all // return a value that's strictly less than "ALL". 
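// For example (illustrative): CNTW with the ALL pattern can return at most
// 2048 / 32 == 64 on any implementation, so all but the low 7 bits of the
// result are reported as known zero here.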
unsigned RequiredBits = llvm::bit_width(MaxElements); unsigned BitWidth = Known.Zero.getBitWidth(); if (RequiredBits < BitWidth) Known.Zero.setHighBits(BitWidth - RequiredBits); return false; } } } return TargetLowering::SimplifyDemandedBitsForTargetNode( Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); } bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const { return Op.getOpcode() == AArch64ISD::DUP || Op.getOpcode() == AArch64ISD::MOVI || (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && Op.getOperand(0).getOpcode() == AArch64ISD::DUP) || TargetLowering::isTargetCanonicalConstantNode(Op); } bool AArch64TargetLowering::isConstantUnsignedBitfieldExtractLegal( unsigned Opc, LLT Ty1, LLT Ty2) const { return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)); } bool AArch64TargetLowering::isComplexDeinterleavingSupported() const { return Subtarget->hasSVE() || Subtarget->hasSVE2() || Subtarget->hasComplxNum(); } bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported( ComplexDeinterleavingOperation Operation, Type *Ty) const { auto *VTy = dyn_cast(Ty); if (!VTy) return false; // If the vector is scalable, SVE is enabled, implying support for complex // numbers. Otherwirse, we need to ensure complex number support is avaialble if (!VTy->isScalableTy() && !Subtarget->hasComplxNum()) return false; auto *ScalarTy = VTy->getScalarType(); unsigned NumElements = VTy->getElementCount().getKnownMinValue(); // We can only process vectors that have a bit size of 128 or higher (with an // additional 64 bits for Neon). Additionally, these vectors must have a // power-of-2 size, as we later split them into the smallest supported size // and merging them back together after applying complex operation. unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements; if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) || !llvm::isPowerOf2_32(VTyWidth)) return false; if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2()) { unsigned ScalarWidth = ScalarTy->getScalarSizeInBits(); return 8 <= ScalarWidth && ScalarWidth <= 64; } return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) || ScalarTy->isFloatTy() || ScalarTy->isDoubleTy(); } Value *AArch64TargetLowering::createComplexDeinterleavingIR( IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator) const { VectorType *Ty = cast(InputA->getType()); bool IsScalable = Ty->isScalableTy(); bool IsInt = Ty->getElementType()->isIntegerTy(); unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue(); assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) && "Vector type must be either 64 or a power of 2 that is at least 128"); if (TyWidth > 128) { int Stride = Ty->getElementCount().getKnownMinValue() / 2; auto *HalfTy = VectorType::getHalfElementsVectorType(Ty); auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0)); auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0)); auto *UpperSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride)); auto *UpperSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride)); Value *LowerSplitAcc = nullptr; Value *UpperSplitAcc = nullptr; if (Accumulator) { LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0)); UpperSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride)); } auto *LowerSplitInt = createComplexDeinterleavingIR( B, 
OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); auto *UpperSplitInt = createComplexDeinterleavingIR( B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt, B.getInt64(0)); return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride)); } if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { if (Accumulator == nullptr) Accumulator = Constant::getNullValue(Ty); if (IsScalable) { if (IsInt) return B.CreateIntrinsic( Intrinsic::aarch64_sve_cmla_x, Ty, {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)}); auto *Mask = B.getAllOnesMask(Ty->getElementCount()); return B.CreateIntrinsic( Intrinsic::aarch64_sve_fcmla, Ty, {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)}); } Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0, Intrinsic::aarch64_neon_vcmla_rot90, Intrinsic::aarch64_neon_vcmla_rot180, Intrinsic::aarch64_neon_vcmla_rot270}; return B.CreateIntrinsic(IdMap[(int)Rotation], Ty, {Accumulator, InputB, InputA}); } if (OperationType == ComplexDeinterleavingOperation::CAdd) { if (IsScalable) { if (Rotation == ComplexDeinterleavingRotation::Rotation_90 || Rotation == ComplexDeinterleavingRotation::Rotation_270) { if (IsInt) return B.CreateIntrinsic( Intrinsic::aarch64_sve_cadd_x, Ty, {InputA, InputB, B.getInt32((int)Rotation * 90)}); auto *Mask = B.getAllOnesMask(Ty->getElementCount()); return B.CreateIntrinsic( Intrinsic::aarch64_sve_fcadd, Ty, {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)}); } return nullptr; } Intrinsic::ID IntId = Intrinsic::not_intrinsic; if (Rotation == ComplexDeinterleavingRotation::Rotation_90) IntId = Intrinsic::aarch64_neon_vcadd_rot90; else if (Rotation == ComplexDeinterleavingRotation::Rotation_270) IntId = Intrinsic::aarch64_neon_vcadd_rot270; if (IntId == Intrinsic::not_intrinsic) return nullptr; return B.CreateIntrinsic(IntId, Ty, {InputA, InputB}); } return nullptr; } bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const { unsigned Opc = N->getOpcode(); if (ISD::isExtOpcode(Opc)) { if (any_of(N->uses(), [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; })) return false; } return true; } diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 24390f1b54f6..5b8f1b00dc03 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1,3166 +1,3168 @@ //===- JumpThreading.cpp - Thread control through conditional blocks ------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the Jump Threading pass. 
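// Roughly, the pass looks at the conditional branch (or switch/indirectbr) at
// the end of each block and asks whether its condition is already known to be
// a constant when control arrives from particular predecessors. If so, those
// predecessors are rerouted ("threading" the jump), either by folding the
// terminator or by duplicating the block onto the threaded edges.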
// //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/JumpThreading.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include #include #include #include #include #include using namespace llvm; using namespace jumpthreading; #define DEBUG_TYPE "jump-threading" STATISTIC(NumThreads, "Number of jumps threaded"); STATISTIC(NumFolds, "Number of terminators folded"); STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi"); static cl::opt BBDuplicateThreshold("jump-threading-threshold", cl::desc("Max block size to duplicate for jump threading"), cl::init(6), cl::Hidden); static cl::opt ImplicationSearchThreshold( "jump-threading-implication-search-threshold", cl::desc("The number of predecessors to search for a stronger " "condition to use to thread over a weaker condition"), cl::init(3), cl::Hidden); static cl::opt PhiDuplicateThreshold( "jump-threading-phi-threshold", cl::desc("Max PHIs in BB to duplicate for jump threading"), cl::init(76), cl::Hidden); static cl::opt PrintLVIAfterJumpThreading( "print-lvi-after-jump-threading", cl::desc("Print the LazyValueInfo cache after JumpThreading"), cl::init(false), cl::Hidden); static cl::opt ThreadAcrossLoopHeaders( "jump-threading-across-loop-headers", cl::desc("Allow JumpThreading to thread across loop headers, for testing"), cl::init(false), cl::Hidden); JumpThreadingPass::JumpThreadingPass(int T) { DefaultBBDupThreshold = (T == -1) ? 
BBDuplicateThreshold : unsigned(T); } // Update branch probability information according to conditional // branch probability. This is usually made possible for cloned branches // in inline instances by the context specific profile in the caller. // For instance, // // [Block PredBB] // [Branch PredBr] // if (t) { // Block A; // } else { // Block B; // } // // [Block BB] // cond = PN([true, %A], [..., %B]); // PHI node // [Branch CondBr] // if (cond) { // ... // P(cond == true) = 1% // } // // Here we know that when block A is taken, cond must be true, which means // P(cond == true | A) = 1 // // Given that P(cond == true) = P(cond == true | A) * P(A) + // P(cond == true | B) * P(B) // we get: // P(cond == true ) = P(A) + P(cond == true | B) * P(B) // // which gives us: // P(A) is less than P(cond == true), i.e. // P(t == true) <= P(cond == true) // // In other words, if we know P(cond == true) is unlikely, we know // that P(t == true) is also unlikely. // static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { BranchInst *CondBr = dyn_cast(BB->getTerminator()); if (!CondBr) return; uint64_t TrueWeight, FalseWeight; if (!extractBranchWeights(*CondBr, TrueWeight, FalseWeight)) return; if (TrueWeight + FalseWeight == 0) // Zero branch_weights do not give a hint for getting branch probabilities. // Technically it would result in division by zero denominator, which is // TrueWeight + FalseWeight. return; // Returns the outgoing edge of the dominating predecessor block // that leads to the PhiNode's incoming block: auto GetPredOutEdge = [](BasicBlock *IncomingBB, BasicBlock *PhiBB) -> std::pair { auto *PredBB = IncomingBB; auto *SuccBB = PhiBB; SmallPtrSet Visited; while (true) { BranchInst *PredBr = dyn_cast(PredBB->getTerminator()); if (PredBr && PredBr->isConditional()) return {PredBB, SuccBB}; Visited.insert(PredBB); auto *SinglePredBB = PredBB->getSinglePredecessor(); if (!SinglePredBB) return {nullptr, nullptr}; // Stop searching when SinglePredBB has been visited. It means we see // an unreachable loop. if (Visited.count(SinglePredBB)) return {nullptr, nullptr}; SuccBB = PredBB; PredBB = SinglePredBB; } }; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *PhiOpnd = PN->getIncomingValue(i); ConstantInt *CI = dyn_cast(PhiOpnd); if (!CI || !CI->getType()->isIntegerTy(1)) continue; BranchProbability BP = (CI->isOne() ? BranchProbability::getBranchProbability( TrueWeight, TrueWeight + FalseWeight) : BranchProbability::getBranchProbability( FalseWeight, TrueWeight + FalseWeight)); auto PredOutEdge = GetPredOutEdge(PN->getIncomingBlock(i), BB); if (!PredOutEdge.first) return; BasicBlock *PredBB = PredOutEdge.first; BranchInst *PredBr = dyn_cast(PredBB->getTerminator()); if (!PredBr) return; uint64_t PredTrueWeight, PredFalseWeight; // FIXME: We currently only set the profile data when it is missing. // With PGO, this can be used to refine even existing profile data with // context information. This needs to be done after more performance // testing. if (extractBranchWeights(*PredBr, PredTrueWeight, PredFalseWeight)) continue; // We can not infer anything useful when BP >= 50%, because BP is the // upper bound probability value. 
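// For example (illustrative weights, not from a real profile): branch_weights
// of {1, 99} on CondBr give P(cond == true) = 1%. For an incoming block that
// forces cond to be true, that 1% is only an upper bound on the probability of
// the corresponding predecessor edge, so we may annotate PredBr with at most
// 1:99. Once BP >= 50% the bound carries no information and we skip the
// update below.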
if (BP >= BranchProbability(50, 100)) continue; SmallVector Weights; if (PredBr->getSuccessor(0) == PredOutEdge.second) { Weights.push_back(BP.getNumerator()); Weights.push_back(BP.getCompl().getNumerator()); } else { Weights.push_back(BP.getCompl().getNumerator()); Weights.push_back(BP.getNumerator()); } PredBr->setMetadata(LLVMContext::MD_prof, MDBuilder(PredBr->getParent()->getContext()) .createBranchWeights(Weights)); } } PreservedAnalyses JumpThreadingPass::run(Function &F, FunctionAnalysisManager &AM) { auto &TTI = AM.getResult(F); // Jump threading makes no sense for targets with divergent control flow. if (TTI.hasBranchDivergence(&F)) return PreservedAnalyses::all(); auto &TLI = AM.getResult(F); auto &LVI = AM.getResult(F); auto &AA = AM.getResult(F); auto &DT = AM.getResult(F); bool Changed = runImpl(F, &AM, &TLI, &TTI, &LVI, &AA, std::make_unique( &DT, nullptr, DomTreeUpdater::UpdateStrategy::Lazy), std::nullopt, std::nullopt); if (PrintLVIAfterJumpThreading) { dbgs() << "LVI for function '" << F.getName() << "':\n"; LVI.printLVI(F, getDomTreeUpdater()->getDomTree(), dbgs()); } if (!Changed) return PreservedAnalyses::all(); getDomTreeUpdater()->flush(); #if defined(EXPENSIVE_CHECKS) assert(getDomTreeUpdater()->getDomTree().verify( DominatorTree::VerificationLevel::Full) && "DT broken after JumpThreading"); assert((!getDomTreeUpdater()->hasPostDomTree() || getDomTreeUpdater()->getPostDomTree().verify( PostDominatorTree::VerificationLevel::Full)) && "PDT broken after JumpThreading"); #else assert(getDomTreeUpdater()->getDomTree().verify( DominatorTree::VerificationLevel::Fast) && "DT broken after JumpThreading"); assert((!getDomTreeUpdater()->hasPostDomTree() || getDomTreeUpdater()->getPostDomTree().verify( PostDominatorTree::VerificationLevel::Fast)) && "PDT broken after JumpThreading"); #endif return getPreservedAnalysis(); } bool JumpThreadingPass::runImpl(Function &F_, FunctionAnalysisManager *FAM_, TargetLibraryInfo *TLI_, TargetTransformInfo *TTI_, LazyValueInfo *LVI_, AliasAnalysis *AA_, std::unique_ptr DTU_, std::optional BFI_, std::optional BPI_) { LLVM_DEBUG(dbgs() << "Jump threading on function '" << F_.getName() << "'\n"); F = &F_; FAM = FAM_; TLI = TLI_; TTI = TTI_; LVI = LVI_; AA = AA_; DTU = std::move(DTU_); BFI = BFI_; BPI = BPI_; auto *GuardDecl = F->getParent()->getFunction( Intrinsic::getName(Intrinsic::experimental_guard)); HasGuards = GuardDecl && !GuardDecl->use_empty(); // Reduce the number of instructions duplicated when optimizing strictly for // size. if (BBDuplicateThreshold.getNumOccurrences()) BBDupThreshold = BBDuplicateThreshold; else if (F->hasFnAttribute(Attribute::MinSize)) BBDupThreshold = 3; else BBDupThreshold = DefaultBBDupThreshold; // JumpThreading must not process blocks unreachable from entry. It's a // waste of compute time and can potentially lead to hangs. SmallPtrSet Unreachable; assert(DTU && "DTU isn't passed into JumpThreading before using it."); assert(DTU->hasDomTree() && "JumpThreading relies on DomTree to proceed."); DominatorTree &DT = DTU->getDomTree(); for (auto &BB : *F) if (!DT.isReachableFromEntry(&BB)) Unreachable.insert(&BB); if (!ThreadAcrossLoopHeaders) findLoopHeaders(*F); bool EverChanged = false; bool Changed; do { Changed = false; for (auto &BB : *F) { if (Unreachable.count(&BB)) continue; while (processBlock(&BB)) // Thread all of the branches we can over BB. Changed = ChangedSinceLastAnalysisUpdate = true; // Jump threading may have introduced redundant debug values into BB // which should be removed.
if (Changed) RemoveRedundantDbgInstrs(&BB); // Stop processing BB if it's the entry or is now deleted. The following // routines attempt to eliminate BB, and locating a suitable replacement // for the entry is non-trivial. if (&BB == &F->getEntryBlock() || DTU->isBBPendingDeletion(&BB)) continue; if (pred_empty(&BB)) { // When processBlock makes BB unreachable it doesn't bother to fix up // the instructions in it. We must remove BB to prevent invalid IR. LLVM_DEBUG(dbgs() << " JT: Deleting dead block '" << BB.getName() << "' with terminator: " << *BB.getTerminator() << '\n'); LoopHeaders.erase(&BB); LVI->eraseBlock(&BB); DeleteDeadBlock(&BB, DTU.get()); Changed = ChangedSinceLastAnalysisUpdate = true; continue; } // processBlock doesn't thread BBs with unconditional TIs. However, if BB // is "almost empty", we attempt to merge BB with its sole successor. auto *BI = dyn_cast(BB.getTerminator()); if (BI && BI->isUnconditional()) { BasicBlock *Succ = BI->getSuccessor(0); if ( // The terminator must be the only non-phi instruction in BB. BB.getFirstNonPHIOrDbg(true)->isTerminator() && // Don't alter Loop headers and latches to ensure another pass can // detect and transform nested loops later. !LoopHeaders.count(&BB) && !LoopHeaders.count(Succ) && TryToSimplifyUncondBranchFromEmptyBlock(&BB, DTU.get())) { RemoveRedundantDbgInstrs(Succ); // BB is valid for cleanup here because we passed in DTU. F remains // BB's parent until a DTU->getDomTree() event. LVI->eraseBlock(&BB); Changed = ChangedSinceLastAnalysisUpdate = true; } } } EverChanged |= Changed; } while (Changed); LoopHeaders.clear(); return EverChanged; } // Replace uses of Cond with ToVal when safe to do so. If all uses are // replaced, we can remove Cond. We cannot blindly replace all uses of Cond // because we may incorrectly replace uses when guards/assumes are uses of // `Cond` and we used the guards/assumes to reason about the `Cond` value // at the end of the block. RAUW unconditionally replaces all uses // including the guards/assumes themselves and the uses before the // guard/assume. static bool replaceFoldableUses(Instruction *Cond, Value *ToVal, BasicBlock *KnownAtEndOfBB) { bool Changed = false; assert(Cond->getType() == ToVal->getType()); // We can unconditionally replace all uses in non-local blocks (i.e. uses // strictly dominated by BB), since LVI information is true from the // terminator of BB. if (Cond->getParent() == KnownAtEndOfBB) Changed |= replaceNonLocalUsesWith(Cond, ToVal); for (Instruction &I : reverse(*KnownAtEndOfBB)) { // Reached the Cond whose uses we are trying to replace, so there are no // more uses. if (&I == Cond) break; // We only replace uses in instructions that are guaranteed to reach the end // of BB, where we know Cond is ToVal. if (!isGuaranteedToTransferExecutionToSuccessor(&I)) break; Changed |= I.replaceUsesOfWith(Cond, ToVal); } if (Cond->use_empty() && !Cond->mayHaveSideEffects()) { Cond->eraseFromParent(); Changed = true; } return Changed; } /// Return the cost of duplicating a piece of this block, from the first /// non-phi up to (but not including) the StopAt instruction, in order to /// thread across it. Stop scanning the block when exceeding the threshold. /// If duplication is impossible, returns ~0U. static unsigned getJumpThreadDuplicationCost(const TargetTransformInfo *TTI, BasicBlock *BB, Instruction *StopAt, unsigned Threshold) { assert(StopAt->getParent() == BB && "Not an instruction from proper BB?"); // Do not duplicate the BB if it has a lot of PHI nodes.
// If a threadable chain is too long then the number of PHI nodes can add up, // leading to a substantial increase in compile time when rewriting the SSA. unsigned PhiCount = 0; Instruction *FirstNonPHI = nullptr; for (Instruction &I : *BB) { if (!isa(&I)) { FirstNonPHI = &I; break; } if (++PhiCount > PhiDuplicateThreshold) return ~0U; } /// Ignore PHI nodes, these will be flattened when duplication happens. BasicBlock::const_iterator I(FirstNonPHI); // FIXME: THREADING will delete values that are just used to compute the // branch, so they shouldn't count against the duplication cost. unsigned Bonus = 0; if (BB->getTerminator() == StopAt) { // Threading through a switch statement is particularly profitable. If this // block ends in a switch, decrease its cost to make it more likely to // happen. if (isa(StopAt)) Bonus = 6; // The same holds for indirect branches, but slightly more so. if (isa(StopAt)) Bonus = 8; } // Bump the threshold up so the early exit from the loop doesn't skip the // terminator-based Size adjustment at the end. Threshold += Bonus; // Sum up the cost of each instruction until we get to the terminator. Don't // include the terminator because the copy won't include it. unsigned Size = 0; for (; &*I != StopAt; ++I) { // Stop scanning the block if we've reached the threshold. if (Size > Threshold) return Size; // Bail out if this instruction gives back a token type, it is not possible // to duplicate it if it is used outside this BB. if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB)) return ~0U; // Blocks with NoDuplicate are modelled as having infinite cost, so they // are never duplicated. if (const CallInst *CI = dyn_cast(I)) if (CI->cannotDuplicate() || CI->isConvergent()) return ~0U; if (TTI->getInstructionCost(&*I, TargetTransformInfo::TCK_SizeAndLatency) == TargetTransformInfo::TCC_Free) continue; // All other instructions count for at least one unit. ++Size; // Calls are more expensive. If they are non-intrinsic calls, we model them // as having cost of 4. If they are a non-vector intrinsic, we model them // as having cost of 2 total, and if they are a vector intrinsic, we model // them as having cost 1. if (const CallInst *CI = dyn_cast(I)) { if (!isa(CI)) Size += 3; else if (!CI->getType()->isVectorTy()) Size += 1; } } return Size > Bonus ? Size - Bonus : 0; } /// findLoopHeaders - We do not want jump threading to turn proper loop /// structures into irreducible loops. Doing this breaks up the loop nesting /// hierarchy and pessimizes later transformations. To prevent this from /// happening, we first have to find the loop headers. Here we approximate this /// by finding targets of backedges in the CFG. /// /// Note that there definitely are cases when we want to allow threading of /// edges across a loop header. For example, threading a jump from outside the /// loop (the preheader) to an exit block of the loop is definitely profitable. /// It is also almost always profitable to thread backedges from within the loop /// to exit blocks, and is often profitable to thread backedges to other blocks /// within the loop (forming a nested loop). This simple analysis is not rich /// enough to track all of these properties and keep it up-to-date as the CFG /// mutates, so we don't allow any of these transformations. 
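/// For example (illustrative block names): given a backedge %latch -> %header
/// reported by FindFunctionBackedges, %header is recorded in LoopHeaders and
/// no edge ending at %header will be threaded.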
void JumpThreadingPass::findLoopHeaders(Function &F) { SmallVector, 32> Edges; FindFunctionBackedges(F, Edges); for (const auto &Edge : Edges) LoopHeaders.insert(Edge.second); } /// getKnownConstant - Helper method to determine if we can thread over a /// terminator with the given value as its condition, and if so what value to /// use for that. What kind of value this is depends on whether we want an /// integer or a block address, but an undef is always accepted. /// Returns null if Val is null or not an appropriate constant. static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) { if (!Val) return nullptr; // Undef is "known" enough. if (UndefValue *U = dyn_cast(Val)) return U; if (Preference == WantBlockAddress) return dyn_cast(Val->stripPointerCasts()); return dyn_cast(Val); } /// computeValueKnownInPredecessors - Given a basic block BB and a value V, see /// if we can infer that the value is a known ConstantInt/BlockAddress or undef /// in any of our predecessors. If so, return the list of known /// (value, predecessor BB) pairs in the result vector. /// /// This returns true if there were any known values. bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( Value *V, BasicBlock *BB, PredValueInfo &Result, ConstantPreference Preference, DenseSet &RecursionSet, Instruction *CxtI) { // This method walks up use-def chains recursively. Because of this, we could // get into an infinite loop going around loops in the use-def chain. To // prevent this, keep track of what (value, block) pairs we've already visited // and terminate the search if we loop back to them. if (!RecursionSet.insert(V).second) return false; // If V is a constant, then it is known in all predecessors. if (Constant *KC = getKnownConstant(V, Preference)) { for (BasicBlock *Pred : predecessors(BB)) Result.emplace_back(KC, Pred); return !Result.empty(); } // If V is a non-instruction value, or an instruction in a different block, // then it can't be derived from a PHI. Instruction *I = dyn_cast(V); if (!I || I->getParent() != BB) { // Okay, if this is a live-in value, see if it has a known value on any // edge from our predecessors. for (BasicBlock *P : predecessors(BB)) { using namespace PatternMatch; // If the value is known by LazyValueInfo to be a constant in a // predecessor, use that information to try to thread this block. Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI); // If I is a non-local compare-with-constant instruction, use the richer // 'getPredicateOnEdge' method. This would be able to handle value // inequalities better, for example if the compare is "X < 4" and "X < 3" // is known true but "X < 4" itself is not available. CmpInst::Predicate Pred; Value *Val; Constant *Cst; if (!PredCst && match(V, m_Cmp(Pred, m_Value(Val), m_Constant(Cst)))) { auto Res = LVI->getPredicateOnEdge(Pred, Val, Cst, P, BB, CxtI); if (Res != LazyValueInfo::Unknown) PredCst = ConstantInt::getBool(V->getContext(), Res); } if (Constant *KC = getKnownConstant(PredCst, Preference)) Result.emplace_back(KC, P); } return !Result.empty(); } /// If I is a PHI node, then we know the incoming values for any constants.
if (PHINode *PN = dyn_cast(I)) { for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *InVal = PN->getIncomingValue(i); if (Constant *KC = getKnownConstant(InVal, Preference)) { Result.emplace_back(KC, PN->getIncomingBlock(i)); } else { Constant *CI = LVI->getConstantOnEdge(InVal, PN->getIncomingBlock(i), BB, CxtI); if (Constant *KC = getKnownConstant(CI, Preference)) Result.emplace_back(KC, PN->getIncomingBlock(i)); } } return !Result.empty(); } // Handle Cast instructions. if (CastInst *CI = dyn_cast(I)) { Value *Source = CI->getOperand(0); computeValueKnownInPredecessorsImpl(Source, BB, Result, Preference, RecursionSet, CxtI); if (Result.empty()) return false; // Convert the known values. for (auto &R : Result) R.first = ConstantExpr::getCast(CI->getOpcode(), R.first, CI->getType()); return true; } if (FreezeInst *FI = dyn_cast(I)) { Value *Source = FI->getOperand(0); computeValueKnownInPredecessorsImpl(Source, BB, Result, Preference, RecursionSet, CxtI); erase_if(Result, [](auto &Pair) { return !isGuaranteedNotToBeUndefOrPoison(Pair.first); }); return !Result.empty(); } // Handle some boolean conditions. if (I->getType()->getPrimitiveSizeInBits() == 1) { using namespace PatternMatch; if (Preference != WantInteger) return false; // X | true -> true // X & false -> false Value *Op0, *Op1; if (match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))) || match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))) { PredValueInfoTy LHSVals, RHSVals; computeValueKnownInPredecessorsImpl(Op0, BB, LHSVals, WantInteger, RecursionSet, CxtI); computeValueKnownInPredecessorsImpl(Op1, BB, RHSVals, WantInteger, RecursionSet, CxtI); if (LHSVals.empty() && RHSVals.empty()) return false; ConstantInt *InterestingVal; if (match(I, m_LogicalOr())) InterestingVal = ConstantInt::getTrue(I->getContext()); else InterestingVal = ConstantInt::getFalse(I->getContext()); SmallPtrSet LHSKnownBBs; // Scan for the sentinel. If we find an undef, force it to the // interesting value: x|undef -> true and x&undef -> false. for (const auto &LHSVal : LHSVals) if (LHSVal.first == InterestingVal || isa(LHSVal.first)) { Result.emplace_back(InterestingVal, LHSVal.second); LHSKnownBBs.insert(LHSVal.second); } for (const auto &RHSVal : RHSVals) if (RHSVal.first == InterestingVal || isa(RHSVal.first)) { // If we already inferred a value for this block on the LHS, don't // re-add it. if (!LHSKnownBBs.count(RHSVal.second)) Result.emplace_back(InterestingVal, RHSVal.second); } return !Result.empty(); } // Handle the NOT form of XOR. if (I->getOpcode() == Instruction::Xor && isa(I->getOperand(1)) && cast(I->getOperand(1))->isOne()) { computeValueKnownInPredecessorsImpl(I->getOperand(0), BB, Result, WantInteger, RecursionSet, CxtI); if (Result.empty()) return false; // Invert the known values. for (auto &R : Result) R.first = ConstantExpr::getNot(R.first); return true; } // Try to simplify some other binary operator values. } else if (BinaryOperator *BO = dyn_cast(I)) { if (Preference != WantInteger) return false; if (ConstantInt *CI = dyn_cast(BO->getOperand(1))) { const DataLayout &DL = BO->getModule()->getDataLayout(); PredValueInfoTy LHSVals; computeValueKnownInPredecessorsImpl(BO->getOperand(0), BB, LHSVals, WantInteger, RecursionSet, CxtI); // Try to use constant folding to simplify the binary operator. 
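// For example (illustrative values): if BO is 'add i32 %x, 4' and %x is known
// to be 3 on the edge from some predecessor, the folded result 7 is recorded
// for that predecessor.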
for (const auto &LHSVal : LHSVals) { Constant *V = LHSVal.first; Constant *Folded = ConstantFoldBinaryOpOperands(BO->getOpcode(), V, CI, DL); if (Constant *KC = getKnownConstant(Folded, WantInteger)) Result.emplace_back(KC, LHSVal.second); } } return !Result.empty(); } // Handle compare with phi operand, where the PHI is defined in this block. if (CmpInst *Cmp = dyn_cast(I)) { if (Preference != WantInteger) return false; Type *CmpType = Cmp->getType(); Value *CmpLHS = Cmp->getOperand(0); Value *CmpRHS = Cmp->getOperand(1); CmpInst::Predicate Pred = Cmp->getPredicate(); PHINode *PN = dyn_cast(CmpLHS); if (!PN) PN = dyn_cast(CmpRHS); if (PN && PN->getParent() == BB) { const DataLayout &DL = PN->getModule()->getDataLayout(); // We can do this simplification if any comparisons fold to true or false. // See if any do. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *PredBB = PN->getIncomingBlock(i); Value *LHS, *RHS; if (PN == CmpLHS) { LHS = PN->getIncomingValue(i); RHS = CmpRHS->DoPHITranslation(BB, PredBB); } else { LHS = CmpLHS->DoPHITranslation(BB, PredBB); RHS = PN->getIncomingValue(i); } Value *Res = simplifyCmpInst(Pred, LHS, RHS, {DL}); if (!Res) { if (!isa(RHS)) continue; // The getPredicateOnEdge call makes no sense if LHS is defined in BB. auto LHSInst = dyn_cast(LHS); if (LHSInst && LHSInst->getParent() == BB) continue; LazyValueInfo::Tristate ResT = LVI->getPredicateOnEdge(Pred, LHS, cast(RHS), PredBB, BB, CxtI ? CxtI : Cmp); if (ResT == LazyValueInfo::Unknown) continue; Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT); } if (Constant *KC = getKnownConstant(Res, WantInteger)) Result.emplace_back(KC, PredBB); } return !Result.empty(); } // If comparing a live-in value against a constant, see if we know the // live-in value on any predecessors. if (isa(CmpRHS) && !CmpType->isVectorTy()) { Constant *CmpConst = cast(CmpRHS); if (!isa(CmpLHS) || cast(CmpLHS)->getParent() != BB) { for (BasicBlock *P : predecessors(BB)) { // If the value is known by LazyValueInfo to be a constant in a // predecessor, use that information to try to thread this block. LazyValueInfo::Tristate Res = LVI->getPredicateOnEdge(Pred, CmpLHS, CmpConst, P, BB, CxtI ? CxtI : Cmp); if (Res == LazyValueInfo::Unknown) continue; Constant *ResC = ConstantInt::get(CmpType, Res); Result.emplace_back(ResC, P); } return !Result.empty(); } // InstCombine can fold some forms of constant range checks into // (icmp (add (x, C1)), C2). See if we have such a thing with // x as a live-in. { using namespace PatternMatch; Value *AddLHS; ConstantInt *AddConst; if (isa(CmpConst) && match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) { if (!isa(AddLHS) || cast(AddLHS)->getParent() != BB) { for (BasicBlock *P : predecessors(BB)) { // If the value is known by LazyValueInfo to be a ConstantRange in // a predecessor, use that information to try to thread this // block. ConstantRange CR = LVI->getConstantRangeOnEdge( AddLHS, P, BB, CxtI ? CxtI : cast(CmpLHS)); // Propagate the range through the addition. CR = CR.add(AddConst->getValue()); // Get the range where the compare returns true.
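// For example (illustrative constants): for 'icmp ult (add i32 %x, -5), 10'
// with %x known to be in [5, 15) on the edge from P, the addition shifts the
// range to [0, 10), which lies entirely inside the true-region of the
// compare, so the compare is known true on that edge.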
ConstantRange CmpRange = ConstantRange::makeExactICmpRegion( Pred, cast(CmpConst)->getValue()); Constant *ResC; if (CmpRange.contains(CR)) ResC = ConstantInt::getTrue(CmpType); else if (CmpRange.inverse().contains(CR)) ResC = ConstantInt::getFalse(CmpType); else continue; Result.emplace_back(ResC, P); } return !Result.empty(); } } } // Try to find a constant value for the LHS of a comparison, // and evaluate it statically if we can. PredValueInfoTy LHSVals; computeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals, WantInteger, RecursionSet, CxtI); for (const auto &LHSVal : LHSVals) { Constant *V = LHSVal.first; Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst); if (Constant *KC = getKnownConstant(Folded, WantInteger)) Result.emplace_back(KC, LHSVal.second); } return !Result.empty(); } } if (SelectInst *SI = dyn_cast(I)) { // Handle select instructions where at least one operand is a known constant // and we can figure out the condition value for any predecessor block. Constant *TrueVal = getKnownConstant(SI->getTrueValue(), Preference); Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference); PredValueInfoTy Conds; if ((TrueVal || FalseVal) && computeValueKnownInPredecessorsImpl(SI->getCondition(), BB, Conds, WantInteger, RecursionSet, CxtI)) { for (auto &C : Conds) { Constant *Cond = C.first; // Figure out what value to use for the condition. bool KnownCond; if (ConstantInt *CI = dyn_cast(Cond)) { // A known boolean. KnownCond = CI->isOne(); } else { assert(isa(Cond) && "Unexpected condition value"); // Either operand will do, so be sure to pick the one that's a known // constant. // FIXME: Do this more cleverly if both values are known constants? KnownCond = (TrueVal != nullptr); } // See if the select has a known constant value for this predecessor. if (Constant *Val = KnownCond ? TrueVal : FalseVal) Result.emplace_back(Val, C.second); } return !Result.empty(); } } // If all else fails, see if LVI can figure out a constant value for us. assert(CxtI->getParent() == BB && "CxtI should be in BB"); Constant *CI = LVI->getConstant(V, CxtI); if (Constant *KC = getKnownConstant(CI, Preference)) { for (BasicBlock *Pred : predecessors(BB)) Result.emplace_back(KC, Pred); } return !Result.empty(); } /// GetBestDestForBranchOnUndef - If we determine that the specified block ends /// in an undefined jump, decide which block is best to revector to. /// /// Since we can pick an arbitrary destination, we pick the successor with the /// fewest predecessors. This should reduce the in-degree of the others. static unsigned getBestDestForJumpOnUndef(BasicBlock *BB) { Instruction *BBTerm = BB->getTerminator(); unsigned MinSucc = 0; BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc); // Compute the successor with the minimum number of predecessors. unsigned MinNumPreds = pred_size(TestBB); for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) { TestBB = BBTerm->getSuccessor(i); unsigned NumPreds = pred_size(TestBB); if (NumPreds < MinNumPreds) { MinSucc = i; MinNumPreds = NumPreds; } } return MinSucc; } static bool hasAddressTakenAndUsed(BasicBlock *BB) { if (!BB->hasAddressTaken()) return false; // If the block has its address taken, it may be a tree of dead constants // hanging off of it. These shouldn't keep the block alive. BlockAddress *BA = BlockAddress::get(BB); BA->removeDeadConstantUsers(); return !BA->use_empty(); } /// processBlock - If there are any predecessors whose control can be threaded /// through to a successor, transform them now. 
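/// Returns true if any change was made; the driver loop in runImpl keeps
/// calling this on the same block until it returns false.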
bool JumpThreadingPass::processBlock(BasicBlock *BB) { // If the block is trivially dead, just return and let the caller nuke it. // This simplifies other transformations. if (DTU->isBBPendingDeletion(BB) || (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock())) return false; // If this block has a single predecessor, and if that pred has a single // successor, merge the blocks. This encourages recursive jump threading // because now the condition in this block can be threaded through // predecessors of our predecessor block. if (maybeMergeBasicBlockIntoOnlyPred(BB)) return true; if (tryToUnfoldSelectInCurrBB(BB)) return true; // Look if we can propagate guards to predecessors. if (HasGuards && processGuards(BB)) return true; // What kind of constant we're looking for. ConstantPreference Preference = WantInteger; // Look to see if the terminator is a conditional branch, switch or indirect // branch, if not we can't thread it. Value *Condition; Instruction *Terminator = BB->getTerminator(); if (BranchInst *BI = dyn_cast(Terminator)) { // Can't thread an unconditional jump. if (BI->isUnconditional()) return false; Condition = BI->getCondition(); } else if (SwitchInst *SI = dyn_cast(Terminator)) { Condition = SI->getCondition(); } else if (IndirectBrInst *IB = dyn_cast(Terminator)) { // Can't thread indirect branch with no successors. if (IB->getNumSuccessors() == 0) return false; Condition = IB->getAddress()->stripPointerCasts(); Preference = WantBlockAddress; } else { return false; // Must be an invoke or callbr. } // Keep track if we constant folded the condition in this invocation. bool ConstantFolded = false; // Run constant folding to see if we can reduce the condition to a simple // constant. if (Instruction *I = dyn_cast(Condition)) { Value *SimpleVal = ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI); if (SimpleVal) { I->replaceAllUsesWith(SimpleVal); if (isInstructionTriviallyDead(I, TLI)) I->eraseFromParent(); Condition = SimpleVal; ConstantFolded = true; } } // If the terminator is branching on an undef or freeze undef, we can pick any // of the successors to branch to. Let getBestDestForJumpOnUndef decide. auto *FI = dyn_cast(Condition); if (isa(Condition) || (FI && isa(FI->getOperand(0)) && FI->hasOneUse())) { unsigned BestSucc = getBestDestForJumpOnUndef(BB); std::vector Updates; // Fold the branch/switch. Instruction *BBTerm = BB->getTerminator(); Updates.reserve(BBTerm->getNumSuccessors()); for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) { if (i == BestSucc) continue; BasicBlock *Succ = BBTerm->getSuccessor(i); Succ->removePredecessor(BB, true); Updates.push_back({DominatorTree::Delete, BB, Succ}); } LLVM_DEBUG(dbgs() << " In block '" << BB->getName() << "' folding undef terminator: " << *BBTerm << '\n'); BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm); ++NumFolds; BBTerm->eraseFromParent(); DTU->applyUpdatesPermissive(Updates); if (FI) FI->eraseFromParent(); return true; } // If the terminator of this block is branching on a constant, simplify the // terminator to an unconditional branch. This can occur due to threading in // other blocks. 
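// For example (illustrative labels), 'br i1 true, label %A, label %B' is
// rewritten to 'br label %A' and %B loses this block as a predecessor.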
if (getKnownConstant(Condition, Preference)) { LLVM_DEBUG(dbgs() << " In block '" << BB->getName() << "' folding terminator: " << *BB->getTerminator() << '\n'); ++NumFolds; ConstantFoldTerminator(BB, true, nullptr, DTU.get()); if (auto *BPI = getBPI()) BPI->eraseBlock(BB); return true; } Instruction *CondInst = dyn_cast(Condition); // All the rest of our checks depend on the condition being an instruction. if (!CondInst) { // FIXME: Unify this with code below. if (processThreadableEdges(Condition, BB, Preference, Terminator)) return true; return ConstantFolded; } // Some of the following optimization can safely work on the unfrozen cond. Value *CondWithoutFreeze = CondInst; if (auto *FI = dyn_cast(CondInst)) CondWithoutFreeze = FI->getOperand(0); if (CmpInst *CondCmp = dyn_cast(CondWithoutFreeze)) { // If we're branching on a conditional, LVI might be able to determine // it's value at the branch instruction. We only handle comparisons // against a constant at this time. if (Constant *CondConst = dyn_cast(CondCmp->getOperand(1))) { LazyValueInfo::Tristate Ret = LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0), CondConst, BB->getTerminator(), /*UseBlockValue=*/false); if (Ret != LazyValueInfo::Unknown) { // We can safely replace *some* uses of the CondInst if it has // exactly one value as returned by LVI. RAUW is incorrect in the // presence of guards and assumes, that have the `Cond` as the use. This // is because we use the guards/assume to reason about the `Cond` value // at the end of block, but RAUW unconditionally replaces all uses // including the guards/assumes themselves and the uses before the // guard/assume. auto *CI = Ret == LazyValueInfo::True ? ConstantInt::getTrue(CondCmp->getType()) : ConstantInt::getFalse(CondCmp->getType()); if (replaceFoldableUses(CondCmp, CI, BB)) return true; } // We did not manage to simplify this branch, try to see whether // CondCmp depends on a known phi-select pattern. if (tryToUnfoldSelect(CondCmp, BB)) return true; } } if (SwitchInst *SI = dyn_cast(BB->getTerminator())) if (tryToUnfoldSelect(SI, BB)) return true; // Check for some cases that are worth simplifying. Right now we want to look // for loads that are used by a switch or by the condition for the branch. If // we see one, check to see if it's partially redundant. If so, insert a PHI // which can then be used to thread the values. Value *SimplifyValue = CondWithoutFreeze; if (CmpInst *CondCmp = dyn_cast(SimplifyValue)) if (isa(CondCmp->getOperand(1))) SimplifyValue = CondCmp->getOperand(0); // TODO: There are other places where load PRE would be profitable, such as // more complex comparisons. if (LoadInst *LoadI = dyn_cast(SimplifyValue)) if (simplifyPartiallyRedundantLoad(LoadI)) return true; // Before threading, try to propagate profile data backwards: if (PHINode *PN = dyn_cast(CondInst)) if (PN->getParent() == BB && isa(BB->getTerminator())) updatePredecessorProfileMetadata(PN, BB); // Handle a variety of cases where we are branching on something derived from // a PHI node in the current block. If we can prove that any predecessors // compute a predictable value based on a PHI node, thread those predecessors. if (processThreadableEdges(CondInst, BB, Preference, Terminator)) return true; // If this is an otherwise-unfoldable branch on a phi node or freeze(phi) in // the current block, see if we can simplify. 
PHINode *PN = dyn_cast(CondWithoutFreeze); if (PN && PN->getParent() == BB && isa(BB->getTerminator())) return processBranchOnPHI(PN); // If this is an otherwise-unfoldable branch on a XOR, see if we can simplify. if (CondInst->getOpcode() == Instruction::Xor && CondInst->getParent() == BB && isa(BB->getTerminator())) return processBranchOnXOR(cast(CondInst)); // Search for a stronger dominating condition that can be used to simplify a // conditional branch leaving BB. if (processImpliedCondition(BB)) return true; return false; } bool JumpThreadingPass::processImpliedCondition(BasicBlock *BB) { auto *BI = dyn_cast(BB->getTerminator()); if (!BI || !BI->isConditional()) return false; Value *Cond = BI->getCondition(); // Assuming that predecessor's branch was taken, if pred's branch condition // (V) implies Cond, Cond can be either true, undef, or poison. In this case, // freeze(Cond) is either true or a nondeterministic value. // If freeze(Cond) has only one use, we can freely fold freeze(Cond) to true // without affecting other instructions. auto *FICond = dyn_cast(Cond); if (FICond && FICond->hasOneUse()) Cond = FICond->getOperand(0); else FICond = nullptr; BasicBlock *CurrentBB = BB; BasicBlock *CurrentPred = BB->getSinglePredecessor(); unsigned Iter = 0; auto &DL = BB->getModule()->getDataLayout(); while (CurrentPred && Iter++ < ImplicationSearchThreshold) { auto *PBI = dyn_cast(CurrentPred->getTerminator()); if (!PBI || !PBI->isConditional()) return false; if (PBI->getSuccessor(0) != CurrentBB && PBI->getSuccessor(1) != CurrentBB) return false; bool CondIsTrue = PBI->getSuccessor(0) == CurrentBB; std::optional Implication = isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue); // If the branch condition of BB (which is Cond) and CurrentPred are // exactly the same freeze instruction, Cond can be folded into CondIsTrue. if (!Implication && FICond && isa(PBI->getCondition())) { if (cast(PBI->getCondition())->getOperand(0) == FICond->getOperand(0)) Implication = CondIsTrue; } if (Implication) { BasicBlock *KeepSucc = BI->getSuccessor(*Implication ? 0 : 1); BasicBlock *RemoveSucc = BI->getSuccessor(*Implication ? 1 : 0); RemoveSucc->removePredecessor(BB); BranchInst *UncondBI = BranchInst::Create(KeepSucc, BI); UncondBI->setDebugLoc(BI->getDebugLoc()); ++NumFolds; BI->eraseFromParent(); if (FICond) FICond->eraseFromParent(); DTU->applyUpdatesPermissive({{DominatorTree::Delete, BB, RemoveSucc}}); if (auto *BPI = getBPI()) BPI->eraseBlock(BB); return true; } CurrentBB = CurrentPred; CurrentPred = CurrentBB->getSinglePredecessor(); } return false; } /// Return true if Op is an instruction defined in the given block. static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) { if (Instruction *OpInst = dyn_cast(Op)) if (OpInst->getParent() == BB) return true; return false; } /// simplifyPartiallyRedundantLoad - If LoadI is an obviously partially /// redundant load instruction, eliminate it by replacing it with a PHI node. /// This is an important optimization that encourages jump threading, and needs /// to be run interlaced with other jump threading tasks. bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { // Don't hack volatile and ordered loads. if (!LoadI->isUnordered()) return false; // If the load is defined in a block with exactly one predecessor, it can't be // partially redundant. 
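// "Partially redundant" here means the loaded value is already available in
// some, but not all, predecessors of LoadBB. With a single predecessor there
// is no such split, so this transformation does not apply.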
BasicBlock *LoadBB = LoadI->getParent(); if (LoadBB->getSinglePredecessor()) return false; // If the load is defined in an EH pad, it can't be partially redundant, // because the edges between the invoke and the EH pad cannot have other // instructions between them. if (LoadBB->isEHPad()) return false; Value *LoadedPtr = LoadI->getOperand(0); // If the loaded operand is defined in the LoadBB and its not a phi, // it can't be available in predecessors. if (isOpDefinedInBlock(LoadedPtr, LoadBB) && !isa(LoadedPtr)) return false; // Scan a few instructions up from the load, to see if it is obviously live at // the entry to its block. BasicBlock::iterator BBIt(LoadI); bool IsLoadCSE; if (Value *AvailableVal = FindAvailableLoadedValue( LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) { // If the value of the load is locally available within the block, just use // it. This frequently occurs for reg2mem'd allocas. if (IsLoadCSE) { LoadInst *NLoadI = cast(AvailableVal); combineMetadataForCSE(NLoadI, LoadI, false); + LVI->forgetValue(NLoadI); }; // If the returned value is the load itself, replace with poison. This can // only happen in dead loops. if (AvailableVal == LoadI) AvailableVal = PoisonValue::get(LoadI->getType()); if (AvailableVal->getType() != LoadI->getType()) AvailableVal = CastInst::CreateBitOrPointerCast( AvailableVal, LoadI->getType(), "", LoadI); LoadI->replaceAllUsesWith(AvailableVal); LoadI->eraseFromParent(); return true; } // Otherwise, if we scanned the whole block and got to the top of the block, // we know the block is locally transparent to the load. If not, something // might clobber its value. if (BBIt != LoadBB->begin()) return false; // If all of the loads and stores that feed the value have the same AA tags, // then we can propagate them onto any newly inserted loads. AAMDNodes AATags = LoadI->getAAMetadata(); SmallPtrSet PredsScanned; using AvailablePredsTy = SmallVector, 8>; AvailablePredsTy AvailablePreds; BasicBlock *OneUnavailablePred = nullptr; SmallVector CSELoads; // If we got here, the loaded value is transparent through to the start of the // block. Check to see if it is available in any of the predecessor blocks. for (BasicBlock *PredBB : predecessors(LoadBB)) { // If we already scanned this predecessor, skip it. if (!PredsScanned.insert(PredBB).second) continue; BBIt = PredBB->end(); unsigned NumScanedInst = 0; Value *PredAvailable = nullptr; // NOTE: We don't CSE load that is volatile or anything stronger than // unordered, that should have been checked when we entered the function. assert(LoadI->isUnordered() && "Attempting to CSE volatile or atomic loads"); // If this is a load on a phi pointer, phi-translate it and search // for available load/store to the pointer in predecessors. Type *AccessTy = LoadI->getType(); const auto &DL = LoadI->getModule()->getDataLayout(); MemoryLocation Loc(LoadedPtr->DoPHITranslation(LoadBB, PredBB), LocationSize::precise(DL.getTypeStoreSize(AccessTy)), AATags); PredAvailable = findAvailablePtrLoadStore(Loc, AccessTy, LoadI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE, &NumScanedInst); // If PredBB has a single predecessor, continue scanning through the // single predecessor. 
BasicBlock *SinglePredBB = PredBB; while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() && NumScanedInst < DefMaxInstsToScan) { SinglePredBB = SinglePredBB->getSinglePredecessor(); if (SinglePredBB) { BBIt = SinglePredBB->end(); PredAvailable = findAvailablePtrLoadStore( Loc, AccessTy, LoadI->isAtomic(), SinglePredBB, BBIt, (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE, &NumScanedInst); } } if (!PredAvailable) { OneUnavailablePred = PredBB; continue; } if (IsLoadCSE) CSELoads.push_back(cast(PredAvailable)); // If so, this load is partially redundant. Remember this info so that we // can create a PHI node. AvailablePreds.emplace_back(PredBB, PredAvailable); } // If the loaded value isn't available in any predecessor, it isn't partially // redundant. if (AvailablePreds.empty()) return false; // Okay, the loaded value is available in at least one (and maybe all!) // predecessors. If the value is unavailable in more than one unique // predecessor, we want to insert a merge block for those common predecessors. // This ensures that we only have to insert one reload, thus not increasing // code size. BasicBlock *UnavailablePred = nullptr; // If the value is unavailable in one of predecessors, we will end up // inserting a new instruction into them. It is only valid if all the // instructions before LoadI are guaranteed to pass execution to its // successor, or if LoadI is safe to speculate. // TODO: If this logic becomes more complex, and we will perform PRE insertion // farther than to a predecessor, we need to reuse the code from GVN's PRE. // It requires domination tree analysis, so for this simple case it is an // overkill. if (PredsScanned.size() != AvailablePreds.size() && !isSafeToSpeculativelyExecute(LoadI)) for (auto I = LoadBB->begin(); &*I != LoadI; ++I) if (!isGuaranteedToTransferExecutionToSuccessor(&*I)) return false; // If there is exactly one predecessor where the value is unavailable, the // already computed 'OneUnavailablePred' block is it. If it ends in an // unconditional branch, we know that it isn't a critical edge. if (PredsScanned.size() == AvailablePreds.size()+1 && OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) { UnavailablePred = OneUnavailablePred; } else if (PredsScanned.size() != AvailablePreds.size()) { // Otherwise, we had multiple unavailable predecessors or we had a critical // edge from the one. SmallVector PredsToSplit; SmallPtrSet AvailablePredSet; for (const auto &AvailablePred : AvailablePreds) AvailablePredSet.insert(AvailablePred.first); // Add all the unavailable predecessors to the PredsToSplit list. for (BasicBlock *P : predecessors(LoadBB)) { // If the predecessor is an indirect goto, we can't split the edge. if (isa(P->getTerminator())) return false; if (!AvailablePredSet.count(P)) PredsToSplit.push_back(P); } // Split them out to their own block. UnavailablePred = splitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split"); } // If the value isn't available in all predecessors, then there will be // exactly one where it isn't available. Insert a load on that edge and add // it to the AvailablePreds list. 
if (UnavailablePred) { assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 && "Can't handle critical edge here!"); LoadInst *NewVal = new LoadInst( LoadI->getType(), LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred), LoadI->getName() + ".pr", false, LoadI->getAlign(), LoadI->getOrdering(), LoadI->getSyncScopeID(), UnavailablePred->getTerminator()); NewVal->setDebugLoc(LoadI->getDebugLoc()); if (AATags) NewVal->setAAMetadata(AATags); AvailablePreds.emplace_back(UnavailablePred, NewVal); } // Now we know that each predecessor of this block has a value in // AvailablePreds, sort them for efficient access as we're walking the preds. array_pod_sort(AvailablePreds.begin(), AvailablePreds.end()); // Create a PHI node at the start of the block for the PRE'd load value. pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB); PHINode *PN = PHINode::Create(LoadI->getType(), std::distance(PB, PE), "", &LoadBB->front()); PN->takeName(LoadI); PN->setDebugLoc(LoadI->getDebugLoc()); // Insert new entries into the PHI for each predecessor. A single block may // have multiple entries here. for (pred_iterator PI = PB; PI != PE; ++PI) { BasicBlock *P = *PI; AvailablePredsTy::iterator I = llvm::lower_bound(AvailablePreds, std::make_pair(P, (Value *)nullptr)); assert(I != AvailablePreds.end() && I->first == P && "Didn't find entry for predecessor!"); // If we have an available predecessor but it requires casting, insert the // cast in the predecessor and use the cast. Note that we have to update the // AvailablePreds vector as we go so that all of the PHI entries for this // predecessor use the same bitcast. Value *&PredV = I->second; if (PredV->getType() != LoadI->getType()) PredV = CastInst::CreateBitOrPointerCast(PredV, LoadI->getType(), "", P->getTerminator()); PN->addIncoming(PredV, I->first); } for (LoadInst *PredLoadI : CSELoads) { combineMetadataForCSE(PredLoadI, LoadI, true); + LVI->forgetValue(PredLoadI); } LoadI->replaceAllUsesWith(PN); LoadI->eraseFromParent(); return true; } /// findMostPopularDest - The specified list contains multiple possible /// threadable destinations. Pick the one that occurs the most frequently in /// the list. static BasicBlock * findMostPopularDest(BasicBlock *BB, const SmallVectorImpl> &PredToDestList) { assert(!PredToDestList.empty()); // Determine popularity. If there are multiple possible destinations, we // explicitly choose to ignore 'undef' destinations. We prefer to thread // blocks with known and real destinations to threading undef. We'll handle // them later if interesting. MapVector DestPopularity; // Populate DestPopularity with the successors in the order they appear in the // successor list. This way, we ensure determinism by iterating it in the // same order in std::max_element below. We map nullptr to 0 so that we can // return nullptr when PredToDestList contains nullptr only. DestPopularity[nullptr] = 0; for (auto *SuccBB : successors(BB)) DestPopularity[SuccBB] = 0; for (const auto &PredToDest : PredToDestList) if (PredToDest.second) DestPopularity[PredToDest.second]++; // Find the most popular dest. auto MostPopular = std::max_element( DestPopularity.begin(), DestPopularity.end(), llvm::less_second()); // Okay, we have finally picked the most popular destination. return MostPopular->first; } // Try to evaluate the value of V when the control flows from PredPredBB to // BB->getSinglePredecessor() and then on to BB. 
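// That is, for the chain PredPredBB -> PredBB -> BB (with PredBB being BB's
// only predecessor): a PHI in PredBB yields its incoming value for PredPredBB,
// a compare in BB is folded from its recursively evaluated operands, and
// anything else is looked up with LVI on the PredPredBB -> PredBB edge.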
Constant *JumpThreadingPass::evaluateOnPredecessorEdge(BasicBlock *BB, BasicBlock *PredPredBB, Value *V) { BasicBlock *PredBB = BB->getSinglePredecessor(); assert(PredBB && "Expected a single predecessor"); if (Constant *Cst = dyn_cast(V)) { return Cst; } // Consult LVI if V is not an instruction in BB or PredBB. Instruction *I = dyn_cast(V); if (!I || (I->getParent() != BB && I->getParent() != PredBB)) { return LVI->getConstantOnEdge(V, PredPredBB, PredBB, nullptr); } // Look into a PHI argument. if (PHINode *PHI = dyn_cast(V)) { if (PHI->getParent() == PredBB) return dyn_cast(PHI->getIncomingValueForBlock(PredPredBB)); return nullptr; } // If we have a CmpInst, try to fold it for each incoming edge into PredBB. if (CmpInst *CondCmp = dyn_cast(V)) { if (CondCmp->getParent() == BB) { Constant *Op0 = evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(0)); Constant *Op1 = evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(1)); if (Op0 && Op1) { return ConstantExpr::getCompare(CondCmp->getPredicate(), Op0, Op1); } } return nullptr; } return nullptr; } bool JumpThreadingPass::processThreadableEdges(Value *Cond, BasicBlock *BB, ConstantPreference Preference, Instruction *CxtI) { // If threading this would thread across a loop header, don't even try to // thread the edge. if (LoopHeaders.count(BB)) return false; PredValueInfoTy PredValues; if (!computeValueKnownInPredecessors(Cond, BB, PredValues, Preference, CxtI)) { // We don't have known values in predecessors. See if we can thread through // BB and its sole predecessor. return maybethreadThroughTwoBasicBlocks(BB, Cond); } assert(!PredValues.empty() && "computeValueKnownInPredecessors returned true with no values"); LLVM_DEBUG(dbgs() << "IN BB: " << *BB; for (const auto &PredValue : PredValues) { dbgs() << " BB '" << BB->getName() << "': FOUND condition = " << *PredValue.first << " for pred '" << PredValue.second->getName() << "'.\n"; }); // Decide what we want to thread through. Convert our list of known values to // a list of known destinations for each pred. This also discards duplicate // predecessors and keeps track of the undefined inputs (which are represented // as a null dest in the PredToDestList). SmallPtrSet SeenPreds; SmallVector, 16> PredToDestList; BasicBlock *OnlyDest = nullptr; BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL; Constant *OnlyVal = nullptr; Constant *MultipleVal = (Constant *)(intptr_t)~0ULL; for (const auto &PredValue : PredValues) { BasicBlock *Pred = PredValue.second; if (!SeenPreds.insert(Pred).second) continue; // Duplicate predecessor entry. Constant *Val = PredValue.first; BasicBlock *DestBB; if (isa(Val)) DestBB = nullptr; else if (BranchInst *BI = dyn_cast(BB->getTerminator())) { assert(isa(Val) && "Expecting a constant integer"); DestBB = BI->getSuccessor(cast(Val)->isZero()); } else if (SwitchInst *SI = dyn_cast(BB->getTerminator())) { assert(isa(Val) && "Expecting a constant integer"); DestBB = SI->findCaseValue(cast(Val))->getCaseSuccessor(); } else { assert(isa(BB->getTerminator()) && "Unexpected terminator"); assert(isa(Val) && "Expecting a constant blockaddress"); DestBB = cast(Val)->getBasicBlock(); } // If we have exactly one destination, remember it for efficiency below. if (PredToDestList.empty()) { OnlyDest = DestBB; OnlyVal = Val; } else { if (OnlyDest != DestBB) OnlyDest = MultipleDestSentinel; // It possible we have same destination, but different value, e.g. default // case in switchinst. 
if (Val != OnlyVal) OnlyVal = MultipleVal; } // If the predecessor ends with an indirect goto, we can't change its // destination. if (isa(Pred->getTerminator())) continue; PredToDestList.emplace_back(Pred, DestBB); } // If all edges were unthreadable, we fail. if (PredToDestList.empty()) return false; // If all the predecessors go to a single known successor, we want to fold, // not thread. By doing so, we do not need to duplicate the current block and // also miss potential opportunities in case we dont/cant duplicate. if (OnlyDest && OnlyDest != MultipleDestSentinel) { if (BB->hasNPredecessors(PredToDestList.size())) { bool SeenFirstBranchToOnlyDest = false; std::vector Updates; Updates.reserve(BB->getTerminator()->getNumSuccessors() - 1); for (BasicBlock *SuccBB : successors(BB)) { if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest) { SeenFirstBranchToOnlyDest = true; // Don't modify the first branch. } else { SuccBB->removePredecessor(BB, true); // This is unreachable successor. Updates.push_back({DominatorTree::Delete, BB, SuccBB}); } } // Finally update the terminator. Instruction *Term = BB->getTerminator(); BranchInst::Create(OnlyDest, Term); ++NumFolds; Term->eraseFromParent(); DTU->applyUpdatesPermissive(Updates); if (auto *BPI = getBPI()) BPI->eraseBlock(BB); // If the condition is now dead due to the removal of the old terminator, // erase it. if (auto *CondInst = dyn_cast(Cond)) { if (CondInst->use_empty() && !CondInst->mayHaveSideEffects()) CondInst->eraseFromParent(); // We can safely replace *some* uses of the CondInst if it has // exactly one value as returned by LVI. RAUW is incorrect in the // presence of guards and assumes, that have the `Cond` as the use. This // is because we use the guards/assume to reason about the `Cond` value // at the end of block, but RAUW unconditionally replaces all uses // including the guards/assumes themselves and the uses before the // guard/assume. else if (OnlyVal && OnlyVal != MultipleVal) replaceFoldableUses(CondInst, OnlyVal, BB); } return true; } } // Determine which is the most common successor. If we have many inputs and // this block is a switch, we want to start by threading the batch that goes // to the most popular destination first. If we only know about one // threadable destination (the common case) we can avoid this. BasicBlock *MostPopularDest = OnlyDest; if (MostPopularDest == MultipleDestSentinel) { // Remove any loop headers from the Dest list, threadEdge conservatively // won't process them, but we might have other destination that are eligible // and we still want to process. erase_if(PredToDestList, [&](const std::pair &PredToDest) { return LoopHeaders.contains(PredToDest.second); }); if (PredToDestList.empty()) return false; MostPopularDest = findMostPopularDest(BB, PredToDestList); } // Now that we know what the most popular destination is, factor all // predecessors that will jump to it into a single predecessor. SmallVector PredsToFactor; for (const auto &PredToDest : PredToDestList) if (PredToDest.second == MostPopularDest) { BasicBlock *Pred = PredToDest.first; // This predecessor may be a switch or something else that has multiple // edges to the block. Factor each of these edges by listing them // according to # occurrences in PredsToFactor. for (BasicBlock *Succ : successors(Pred)) if (Succ == BB) PredsToFactor.push_back(Pred); } // If the threadable edges are branching on an undefined value, we get to pick // the destination that these predecessors should get to. 
if (!MostPopularDest) MostPopularDest = BB->getTerminator()-> getSuccessor(getBestDestForJumpOnUndef(BB)); // Ok, try to thread it! return tryThreadEdge(BB, PredsToFactor, MostPopularDest); } /// processBranchOnPHI - We have an otherwise unthreadable conditional branch on /// a PHI node (or freeze PHI) in the current block. See if there are any /// simplifications we can do based on inputs to the phi node. bool JumpThreadingPass::processBranchOnPHI(PHINode *PN) { BasicBlock *BB = PN->getParent(); // TODO: We could make use of this to do it once for blocks with common PHI // values. SmallVector PredBBs; PredBBs.resize(1); // If any of the predecessor blocks end in an unconditional branch, we can // *duplicate* the conditional branch into that block in order to further // encourage jump threading and to eliminate cases where we have branch on a // phi of an icmp (branch on icmp is much better). // This is still beneficial when a frozen phi is used as the branch condition // because it allows CodeGenPrepare to further canonicalize br(freeze(icmp)) // to br(icmp(freeze ...)). for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *PredBB = PN->getIncomingBlock(i); if (BranchInst *PredBr = dyn_cast(PredBB->getTerminator())) if (PredBr->isUnconditional()) { PredBBs[0] = PredBB; // Try to duplicate BB into PredBB. if (duplicateCondBranchOnPHIIntoPred(BB, PredBBs)) return true; } } return false; } /// processBranchOnXOR - We have an otherwise unthreadable conditional branch on /// a xor instruction in the current block. See if there are any /// simplifications we can do based on inputs to the xor. bool JumpThreadingPass::processBranchOnXOR(BinaryOperator *BO) { BasicBlock *BB = BO->getParent(); // If either the LHS or RHS of the xor is a constant, don't do this // optimization. if (isa(BO->getOperand(0)) || isa(BO->getOperand(1))) return false; // If the first instruction in BB isn't a phi, we won't be able to infer // anything special about any particular predecessor. if (!isa(BB->front())) return false; // If this BB is a landing pad, we won't be able to split the edge into it. if (BB->isEHPad()) return false; // If we have a xor as the branch input to this block, and we know that the // LHS or RHS of the xor in any predecessor is true/false, then we can clone // the condition into the predecessor and fix that value to true, saving some // logical ops on that path and encouraging other paths to simplify. // // This copies something like this: // // BB: // %X = phi i1 [1], [%X'] // %Y = icmp eq i32 %A, %B // %Z = xor i1 %X, %Y // br i1 %Z, ... // // Into: // BB': // %Y = icmp ne i32 %A, %B // br i1 %Y, ... PredValueInfoTy XorOpValues; bool isLHS = true; if (!computeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues, WantInteger, BO)) { assert(XorOpValues.empty()); if (!computeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues, WantInteger, BO)) return false; isLHS = false; } assert(!XorOpValues.empty() && "computeValueKnownInPredecessors returned true with no values"); // Scan the information to see which is most popular: true or false. The // predecessors can be of the set true, false, or undef. unsigned NumTrue = 0, NumFalse = 0; for (const auto &XorOpValue : XorOpValues) { if (isa(XorOpValue.first)) // Ignore undefs for the count. continue; if (cast(XorOpValue.first)->isZero()) ++NumFalse; else ++NumTrue; } // Determine which value to split on, true, false, or undef if neither. 
ConstantInt *SplitVal = nullptr; if (NumTrue > NumFalse) SplitVal = ConstantInt::getTrue(BB->getContext()); else if (NumTrue != 0 || NumFalse != 0) SplitVal = ConstantInt::getFalse(BB->getContext()); // Collect all of the blocks that this can be folded into so that we can // factor this once and clone it once. SmallVector BlocksToFoldInto; for (const auto &XorOpValue : XorOpValues) { if (XorOpValue.first != SplitVal && !isa(XorOpValue.first)) continue; BlocksToFoldInto.push_back(XorOpValue.second); } // If we inferred a value for all of the predecessors, then duplication won't // help us. However, we can just replace the LHS or RHS with the constant. if (BlocksToFoldInto.size() == cast(BB->front()).getNumIncomingValues()) { if (!SplitVal) { // If all preds provide undef, just nuke the xor, because it is undef too. BO->replaceAllUsesWith(UndefValue::get(BO->getType())); BO->eraseFromParent(); } else if (SplitVal->isZero() && BO != BO->getOperand(isLHS)) { // If all preds provide 0, replace the xor with the other input. BO->replaceAllUsesWith(BO->getOperand(isLHS)); BO->eraseFromParent(); } else { // If all preds provide 1, set the computed value to 1. BO->setOperand(!isLHS, SplitVal); } return true; } // If any of predecessors end with an indirect goto, we can't change its // destination. if (any_of(BlocksToFoldInto, [](BasicBlock *Pred) { return isa(Pred->getTerminator()); })) return false; // Try to duplicate BB into PredBB. return duplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto); } /// addPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new /// predecessor to the PHIBB block. If it has PHI nodes, add entries for /// NewPred using the entries from OldPred (suitably mapped). static void addPHINodeEntriesForMappedBlock(BasicBlock *PHIBB, BasicBlock *OldPred, BasicBlock *NewPred, DenseMap &ValueMap) { for (PHINode &PN : PHIBB->phis()) { // Ok, we have a PHI node. Figure out what the incoming value was for the // DestBlock. Value *IV = PN.getIncomingValueForBlock(OldPred); // Remap the value if necessary. if (Instruction *Inst = dyn_cast(IV)) { DenseMap::iterator I = ValueMap.find(Inst); if (I != ValueMap.end()) IV = I->second; } PN.addIncoming(IV, NewPred); } } /// Merge basic block BB into its sole predecessor if possible. bool JumpThreadingPass::maybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB) { BasicBlock *SinglePred = BB->getSinglePredecessor(); if (!SinglePred) return false; const Instruction *TI = SinglePred->getTerminator(); if (TI->isExceptionalTerminator() || TI->getNumSuccessors() != 1 || SinglePred == BB || hasAddressTakenAndUsed(BB)) return false; // If SinglePred was a loop header, BB becomes one. if (LoopHeaders.erase(SinglePred)) LoopHeaders.insert(BB); LVI->eraseBlock(SinglePred); MergeBasicBlockIntoOnlyPred(BB, DTU.get()); // Now that BB is merged into SinglePred (i.e. SinglePred code followed by // BB code within one basic block `BB`), we need to invalidate the LVI // information associated with BB, because the LVI information need not be // true for all of BB after the merge. For example, // Before the merge, LVI info and code is as follows: // SinglePred: // %y = use of %p // call @exit() // need not transfer execution to successor. // assume(%p) // from this point on %p is true // br label %BB // BB: // %x = use of %p // br label exit // // Note that this LVI info for blocks BB and SinglPred is correct for %p // (info2 and info1 respectively). After the merge and the deletion of the // LVI info1 for SinglePred. 
We have the following code: // BB: // %y = use of %p // call @exit() // assume(%p) // %x = use of %p <-- LVI info2 is correct from here onwards. // br label exit // LVI info2 for BB is incorrect at the beginning of BB. // Invalidate LVI information for BB if the LVI is not provably true for // all of BB. if (!isGuaranteedToTransferExecutionToSuccessor(BB)) LVI->eraseBlock(BB); return true; } /// Update the SSA form. NewBB contains instructions that are copied from BB. /// ValueMapping maps old values in BB to new ones in NewBB. void JumpThreadingPass::updateSSA( BasicBlock *BB, BasicBlock *NewBB, DenseMap &ValueMapping) { // If there were values defined in BB that are used outside the block, then we // now have to update all uses of the value to use either the original value, // the cloned value, or some PHI derived value. This can require arbitrary // PHI insertion, of which we are prepared to do, clean these up now. SSAUpdater SSAUpdate; SmallVector UsesToRename; SmallVector DbgValues; for (Instruction &I : *BB) { // Scan all uses of this instruction to see if it is used outside of its // block, and if so, record them in UsesToRename. for (Use &U : I.uses()) { Instruction *User = cast(U.getUser()); if (PHINode *UserPN = dyn_cast(User)) { if (UserPN->getIncomingBlock(U) == BB) continue; } else if (User->getParent() == BB) continue; UsesToRename.push_back(&U); } // Find debug values outside of the block findDbgValues(DbgValues, &I); DbgValues.erase(remove_if(DbgValues, [&](const DbgValueInst *DbgVal) { return DbgVal->getParent() == BB; }), DbgValues.end()); // If there are no uses outside the block, we're done with this instruction. if (UsesToRename.empty() && DbgValues.empty()) continue; LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); // We found a use of I outside of BB. Rename all uses of I that are outside // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks // with the two values we know. SSAUpdate.Initialize(I.getType(), I.getName()); SSAUpdate.AddAvailableValue(BB, &I); SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]); while (!UsesToRename.empty()) SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); if (!DbgValues.empty()) { SSAUpdate.UpdateDebugValues(&I, DbgValues); DbgValues.clear(); } LLVM_DEBUG(dbgs() << "\n"); } } /// Clone instructions in range [BI, BE) to NewBB. For PHI nodes, we only clone /// arguments that come from PredBB. Return the map from the variables in the /// source basic block to the variables in the newly created basic block. DenseMap JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI, BasicBlock::iterator BE, BasicBlock *NewBB, BasicBlock *PredBB) { // We are going to have to map operands from the source basic block to the new // copy of the block 'NewBB'. If there are PHI nodes in the source basic // block, evaluate them to account for entry from PredBB. DenseMap ValueMapping; // Retargets llvm.dbg.value to any renamed variables. 
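  // For instance (sketch, names hypothetical): a cloned
  //   call void @llvm.dbg.value(metadata i32 %x, ...)
  // whose location operand %x was itself cloned into NewBB should be rewritten
  // to point at the clone of %x, so the debug intrinsic in NewBB describes the
  // copied value rather than the original one.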
auto RetargetDbgValueIfPossible = [&](Instruction *NewInst) -> bool { auto DbgInstruction = dyn_cast(NewInst); if (!DbgInstruction) return false; SmallSet, 16> OperandsToRemap; for (auto DbgOperand : DbgInstruction->location_ops()) { auto DbgOperandInstruction = dyn_cast(DbgOperand); if (!DbgOperandInstruction) continue; auto I = ValueMapping.find(DbgOperandInstruction); if (I != ValueMapping.end()) { OperandsToRemap.insert( std::pair(DbgOperand, I->second)); } } for (auto &[OldOp, MappedOp] : OperandsToRemap) DbgInstruction->replaceVariableLocationOp(OldOp, MappedOp); return true; }; // Clone the phi nodes of the source basic block into NewBB. The resulting // phi nodes are trivial since NewBB only has one predecessor, but SSAUpdater // might need to rewrite the operand of the cloned phi. for (; PHINode *PN = dyn_cast(BI); ++BI) { PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB); NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB); ValueMapping[PN] = NewPN; } // Clone noalias scope declarations in the threaded block. When threading a // loop exit, we would otherwise end up with two idential scope declarations // visible at the same time. SmallVector NoAliasScopes; DenseMap ClonedScopes; LLVMContext &Context = PredBB->getContext(); identifyNoAliasScopesToClone(BI, BE, NoAliasScopes); cloneNoAliasScopes(NoAliasScopes, ClonedScopes, "thread", Context); // Clone the non-phi instructions of the source basic block into NewBB, // keeping track of the mapping and using it to remap operands in the cloned // instructions. for (; BI != BE; ++BI) { Instruction *New = BI->clone(); New->setName(BI->getName()); New->insertInto(NewBB, NewBB->end()); ValueMapping[&*BI] = New; adaptNoAliasScopes(New, ClonedScopes, Context); if (RetargetDbgValueIfPossible(New)) continue; // Remap operands to patch up intra-block references. for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) if (Instruction *Inst = dyn_cast(New->getOperand(i))) { DenseMap::iterator I = ValueMapping.find(Inst); if (I != ValueMapping.end()) New->setOperand(i, I->second); } } return ValueMapping; } /// Attempt to thread through two successive basic blocks. bool JumpThreadingPass::maybethreadThroughTwoBasicBlocks(BasicBlock *BB, Value *Cond) { // Consider: // // PredBB: // %var = phi i32* [ null, %bb1 ], [ @a, %bb2 ] // %tobool = icmp eq i32 %cond, 0 // br i1 %tobool, label %BB, label ... // // BB: // %cmp = icmp eq i32* %var, null // br i1 %cmp, label ..., label ... // // We don't know the value of %var at BB even if we know which incoming edge // we take to BB. However, once we duplicate PredBB for each of its incoming // edges (say, PredBB1 and PredBB2), we know the value of %var in each copy of // PredBB. Then we can thread edges PredBB1->BB and PredBB2->BB through BB. // Require that BB end with a Branch for simplicity. BranchInst *CondBr = dyn_cast(BB->getTerminator()); if (!CondBr) return false; // BB must have exactly one predecessor. BasicBlock *PredBB = BB->getSinglePredecessor(); if (!PredBB) return false; // Require that PredBB end with a conditional Branch. If PredBB ends with an // unconditional branch, we should be merging PredBB and BB instead. For // simplicity, we don't deal with a switch. BranchInst *PredBBBranch = dyn_cast(PredBB->getTerminator()); if (!PredBBBranch || PredBBBranch->isUnconditional()) return false; // If PredBB has exactly one incoming edge, we don't gain anything by copying // PredBB. 
if (PredBB->getSinglePredecessor()) return false; // Don't thread through PredBB if it contains a successor edge to itself, in // which case we would infinite loop. Suppose we are threading an edge from // PredPredBB through PredBB and BB to SuccBB with PredBB containing a // successor edge to itself. If we allowed jump threading in this case, we // could duplicate PredBB and BB as, say, PredBB.thread and BB.thread. Since // PredBB.thread has a successor edge to PredBB, we would immediately come up // with another jump threading opportunity from PredBB.thread through PredBB // and BB to SuccBB. This jump threading would repeatedly occur. That is, we // would keep peeling one iteration from PredBB. if (llvm::is_contained(successors(PredBB), PredBB)) return false; // Don't thread across a loop header. if (LoopHeaders.count(PredBB)) return false; // Avoid complication with duplicating EH pads. if (PredBB->isEHPad()) return false; // Find a predecessor that we can thread. For simplicity, we only consider a // successor edge out of BB to which we thread exactly one incoming edge into // PredBB. unsigned ZeroCount = 0; unsigned OneCount = 0; BasicBlock *ZeroPred = nullptr; BasicBlock *OnePred = nullptr; for (BasicBlock *P : predecessors(PredBB)) { // If PredPred ends with IndirectBrInst, we can't handle it. if (isa(P->getTerminator())) continue; if (ConstantInt *CI = dyn_cast_or_null( evaluateOnPredecessorEdge(BB, P, Cond))) { if (CI->isZero()) { ZeroCount++; ZeroPred = P; } else if (CI->isOne()) { OneCount++; OnePred = P; } } } // Disregard complicated cases where we have to thread multiple edges. BasicBlock *PredPredBB; if (ZeroCount == 1) { PredPredBB = ZeroPred; } else if (OneCount == 1) { PredPredBB = OnePred; } else { return false; } BasicBlock *SuccBB = CondBr->getSuccessor(PredPredBB == ZeroPred); // If threading to the same block as we come from, we would infinite loop. if (SuccBB == BB) { LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName() << "' - would thread to self!\n"); return false; } // If threading this would thread across a loop header, don't thread the edge. // See the comments above findLoopHeaders for justifications and caveats. if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) { LLVM_DEBUG({ bool BBIsHeader = LoopHeaders.count(BB); bool SuccIsHeader = LoopHeaders.count(SuccBB); dbgs() << " Not threading across " << (BBIsHeader ? "loop header BB '" : "block BB '") << BB->getName() << "' to dest " << (SuccIsHeader ? "loop header BB '" : "block BB '") << SuccBB->getName() << "' - it might create an irreducible loop!\n"; }); return false; } // Compute the cost of duplicating BB and PredBB. unsigned BBCost = getJumpThreadDuplicationCost( TTI, BB, BB->getTerminator(), BBDupThreshold); unsigned PredBBCost = getJumpThreadDuplicationCost( TTI, PredBB, PredBB->getTerminator(), BBDupThreshold); // Give up if costs are too high. We need to check BBCost and PredBBCost // individually before checking their sum because getJumpThreadDuplicationCost // return (unsigned)~0 for those basic blocks that cannot be duplicated. if (BBCost > BBDupThreshold || PredBBCost > BBDupThreshold || BBCost + PredBBCost > BBDupThreshold) { LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName() << "' - Cost is too high: " << PredBBCost << " for PredBB, " << BBCost << "for BB\n"); return false; } // Now we are ready to duplicate PredBB. 
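  // Resulting shape (sketch, block names hypothetical):
  //
  //   PredPredBB --> PredBB.thread --> SuccBB
  //
  // where PredBB.thread is the copy of PredBB in which Cond is known to be
  // constant, so that copy can be threaded across BB straight to SuccBB.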
threadThroughTwoBasicBlocks(PredPredBB, PredBB, BB, SuccBB); return true; } void JumpThreadingPass::threadThroughTwoBasicBlocks(BasicBlock *PredPredBB, BasicBlock *PredBB, BasicBlock *BB, BasicBlock *SuccBB) { LLVM_DEBUG(dbgs() << " Threading through '" << PredBB->getName() << "' and '" << BB->getName() << "'\n"); // Build BPI/BFI before any changes are made to IR. bool HasProfile = doesBlockHaveProfileData(BB); auto *BFI = getOrCreateBFI(HasProfile); auto *BPI = getOrCreateBPI(BFI != nullptr); BranchInst *CondBr = cast(BB->getTerminator()); BranchInst *PredBBBranch = cast(PredBB->getTerminator()); BasicBlock *NewBB = BasicBlock::Create(PredBB->getContext(), PredBB->getName() + ".thread", PredBB->getParent(), PredBB); NewBB->moveAfter(PredBB); // Set the block frequency of NewBB. if (BFI) { assert(BPI && "It's expected BPI to exist along with BFI"); auto NewBBFreq = BFI->getBlockFreq(PredPredBB) * BPI->getEdgeProbability(PredPredBB, PredBB); BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); } // We are going to have to map operands from the original BB block to the new // copy of the block 'NewBB'. If there are PHI nodes in PredBB, evaluate them // to account for entry from PredPredBB. DenseMap ValueMapping = cloneInstructions(PredBB->begin(), PredBB->end(), NewBB, PredPredBB); // Copy the edge probabilities from PredBB to NewBB. if (BPI) BPI->copyEdgeProbabilities(PredBB, NewBB); // Update the terminator of PredPredBB to jump to NewBB instead of PredBB. // This eliminates predecessors from PredPredBB, which requires us to simplify // any PHI nodes in PredBB. Instruction *PredPredTerm = PredPredBB->getTerminator(); for (unsigned i = 0, e = PredPredTerm->getNumSuccessors(); i != e; ++i) if (PredPredTerm->getSuccessor(i) == PredBB) { PredBB->removePredecessor(PredPredBB, true); PredPredTerm->setSuccessor(i, NewBB); } addPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(0), PredBB, NewBB, ValueMapping); addPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(1), PredBB, NewBB, ValueMapping); DTU->applyUpdatesPermissive( {{DominatorTree::Insert, NewBB, CondBr->getSuccessor(0)}, {DominatorTree::Insert, NewBB, CondBr->getSuccessor(1)}, {DominatorTree::Insert, PredPredBB, NewBB}, {DominatorTree::Delete, PredPredBB, PredBB}}); updateSSA(PredBB, NewBB, ValueMapping); // Clean up things like PHI nodes with single operands, dead instructions, // etc. SimplifyInstructionsInBlock(NewBB, TLI); SimplifyInstructionsInBlock(PredBB, TLI); SmallVector PredsToFactor; PredsToFactor.push_back(NewBB); threadEdge(BB, PredsToFactor, SuccBB); } /// tryThreadEdge - Thread an edge if it's safe and profitable to do so. bool JumpThreadingPass::tryThreadEdge( BasicBlock *BB, const SmallVectorImpl &PredBBs, BasicBlock *SuccBB) { // If threading to the same block as we come from, we would infinite loop. if (SuccBB == BB) { LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName() << "' - would thread to self!\n"); return false; } // If threading this would thread across a loop header, don't thread the edge. // See the comments above findLoopHeaders for justifications and caveats. if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) { LLVM_DEBUG({ bool BBIsHeader = LoopHeaders.count(BB); bool SuccIsHeader = LoopHeaders.count(SuccBB); dbgs() << " Not threading across " << (BBIsHeader ? "loop header BB '" : "block BB '") << BB->getName() << "' to dest " << (SuccIsHeader ? 
"loop header BB '" : "block BB '") << SuccBB->getName() << "' - it might create an irreducible loop!\n"; }); return false; } unsigned JumpThreadCost = getJumpThreadDuplicationCost( TTI, BB, BB->getTerminator(), BBDupThreshold); if (JumpThreadCost > BBDupThreshold) { LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName() << "' - Cost is too high: " << JumpThreadCost << "\n"); return false; } threadEdge(BB, PredBBs, SuccBB); return true; } /// threadEdge - We have decided that it is safe and profitable to factor the /// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB /// across BB. Transform the IR to reflect this change. void JumpThreadingPass::threadEdge(BasicBlock *BB, const SmallVectorImpl &PredBBs, BasicBlock *SuccBB) { assert(SuccBB != BB && "Don't create an infinite loop"); assert(!LoopHeaders.count(BB) && !LoopHeaders.count(SuccBB) && "Don't thread across loop headers"); // Build BPI/BFI before any changes are made to IR. bool HasProfile = doesBlockHaveProfileData(BB); auto *BFI = getOrCreateBFI(HasProfile); auto *BPI = getOrCreateBPI(BFI != nullptr); // And finally, do it! Start by factoring the predecessors if needed. BasicBlock *PredBB; if (PredBBs.size() == 1) PredBB = PredBBs[0]; else { LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); PredBB = splitBlockPreds(BB, PredBBs, ".thr_comm"); } // And finally, do it! LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() << "' to '" << SuccBB->getName() << ", across block:\n " << *BB << "\n"); LVI->threadEdge(PredBB, BB, SuccBB); BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+".thread", BB->getParent(), BB); NewBB->moveAfter(PredBB); // Set the block frequency of NewBB. if (BFI) { assert(BPI && "It's expected BPI to exist along with BFI"); auto NewBBFreq = BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB); BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); } // Copy all the instructions from BB to NewBB except the terminator. DenseMap ValueMapping = cloneInstructions(BB->begin(), std::prev(BB->end()), NewBB, PredBB); // We didn't copy the terminator from BB over to NewBB, because there is now // an unconditional jump to SuccBB. Insert the unconditional jump. BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB); NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc()); // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the // PHI nodes for NewBB now. addPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping); // Update the terminator of PredBB to jump to NewBB instead of BB. This // eliminates predecessors from BB, which requires us to simplify any PHI // nodes in BB. Instruction *PredTerm = PredBB->getTerminator(); for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i) if (PredTerm->getSuccessor(i) == BB) { BB->removePredecessor(PredBB, true); PredTerm->setSuccessor(i, NewBB); } // Enqueue required DT updates. DTU->applyUpdatesPermissive({{DominatorTree::Insert, NewBB, SuccBB}, {DominatorTree::Insert, PredBB, NewBB}, {DominatorTree::Delete, PredBB, BB}}); updateSSA(BB, NewBB, ValueMapping); // At this point, the IR is fully up to date and consistent. Do a quick scan // over the new instructions and zap any that are constants or dead. This // frequently happens because of phi translation. SimplifyInstructionsInBlock(NewBB, TLI); // Update the edge weight from BB to SuccBB, which should be less than before. 
  updateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB, BFI, BPI, HasProfile);

  // Threaded an edge!
  ++NumThreads;
}

/// Create a new basic block that will be the predecessor of BB and successor
/// of all blocks in Preds. When profile data is available, update the
/// frequency of this new block.
BasicBlock *JumpThreadingPass::splitBlockPreds(BasicBlock *BB,
                                               ArrayRef<BasicBlock *> Preds,
                                               const char *Suffix) {
  SmallVector<BasicBlock *, 2> NewBBs;

  // Collect the frequencies of all predecessors of BB, which will be used to
  // update the edge weight of the result of splitting predecessors.
  DenseMap<BasicBlock *, BlockFrequency> FreqMap;
  auto *BFI = getBFI();
  if (BFI) {
    auto *BPI = getOrCreateBPI(true);
    for (auto *Pred : Preds)
      FreqMap.insert(std::make_pair(
          Pred, BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB)));
  }

  // In the case when BB is a LandingPad block we create 2 new predecessors
  // instead of just one.
  if (BB->isLandingPad()) {
    std::string NewName = std::string(Suffix) + ".split-lp";
    SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs);
  } else {
    NewBBs.push_back(SplitBlockPredecessors(BB, Preds, Suffix));
  }

  std::vector<DominatorTree::UpdateType> Updates;
  Updates.reserve((2 * Preds.size()) + NewBBs.size());
  for (auto *NewBB : NewBBs) {
    BlockFrequency NewBBFreq(0);
    Updates.push_back({DominatorTree::Insert, NewBB, BB});
    for (auto *Pred : predecessors(NewBB)) {
      Updates.push_back({DominatorTree::Delete, Pred, BB});
      Updates.push_back({DominatorTree::Insert, Pred, NewBB});
      if (BFI) // Update frequencies between Pred -> NewBB.
        NewBBFreq += FreqMap.lookup(Pred);
    }
    if (BFI) // Apply the summed frequency to NewBB.
      BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
  }

  DTU->applyUpdatesPermissive(Updates);
  return NewBBs[0];
}

bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) {
  const Instruction *TI = BB->getTerminator();
  if (!TI || TI->getNumSuccessors() < 2)
    return false;
  return hasValidBranchWeightMD(*TI);
}

/// Update the block frequency of BB and branch weight and the metadata on the
/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 -
/// Freq(PredBB->BB) / Freq(BB->SuccBB).
void JumpThreadingPass::updateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
                                                     BasicBlock *BB,
                                                     BasicBlock *NewBB,
                                                     BasicBlock *SuccBB,
                                                     BlockFrequencyInfo *BFI,
                                                     BranchProbabilityInfo *BPI,
                                                     bool HasProfile) {
  assert(((BFI && BPI) || (!BFI && !BPI)) &&
         "Both BFI & BPI should either be set or unset");

  if (!BFI) {
    assert(!HasProfile &&
           "It's expected to have BFI/BPI when profile info exists");
    return;
  }

  // As the edge from PredBB to BB is deleted, we have to update the block
  // frequency of BB.
  auto BBOrigFreq = BFI->getBlockFreq(BB);
  auto NewBBFreq = BFI->getBlockFreq(NewBB);
  auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB);
  auto BBNewFreq = BBOrigFreq - NewBBFreq;
  BFI->setBlockFreq(BB, BBNewFreq.getFrequency());

  // Collect updated outgoing edges' frequencies from BB and use them to update
  // edge probabilities.
  SmallVector<uint64_t, 4> BBSuccFreq;
  for (BasicBlock *Succ : successors(BB)) {
    auto SuccFreq = (Succ == SuccBB)
                        ? BB2SuccBBFreq - NewBBFreq
                        : BBOrigFreq * BPI->getEdgeProbability(BB, Succ);
    BBSuccFreq.push_back(SuccFreq.getFrequency());
  }

  uint64_t MaxBBSuccFreq =
      *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end());

  SmallVector<BranchProbability, 4> BBSuccProbs;
  if (MaxBBSuccFreq == 0)
    BBSuccProbs.assign(BBSuccFreq.size(),
                       {1, static_cast<uint32_t>(BBSuccFreq.size())});
  else {
    for (uint64_t Freq : BBSuccFreq)
      BBSuccProbs.push_back(
          BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq));
    // Normalize edge probabilities so that they sum up to one.
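    // For example (illustrative): raw successor frequencies {30, 10} become
    // 30/30 and 10/30 relative to MaxBBSuccFreq, and normalization rescales
    // them to 3/4 and 1/4 so that the outgoing probabilities sum to one.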
BranchProbability::normalizeProbabilities(BBSuccProbs.begin(), BBSuccProbs.end()); } // Update edge probabilities in BPI. BPI->setEdgeProbability(BB, BBSuccProbs); // Update the profile metadata as well. // // Don't do this if the profile of the transformed blocks was statically // estimated. (This could occur despite the function having an entry // frequency in completely cold parts of the CFG.) // // In this case we don't want to suggest to subsequent passes that the // calculated weights are fully consistent. Consider this graph: // // check_1 // 50% / | // eq_1 | 50% // \ | // check_2 // 50% / | // eq_2 | 50% // \ | // check_3 // 50% / | // eq_3 | 50% // \ | // // Assuming the blocks check_* all compare the same value against 1, 2 and 3, // the overall probabilities are inconsistent; the total probability that the // value is either 1, 2 or 3 is 150%. // // As a consequence if we thread eq_1 -> check_2 to check_3, check_2->check_3 // becomes 0%. This is even worse if the edge whose probability becomes 0% is // the loop exit edge. Then based solely on static estimation we would assume // the loop was extremely hot. // // FIXME this locally as well so that BPI and BFI are consistent as well. We // shouldn't make edges extremely likely or unlikely based solely on static // estimation. if (BBSuccProbs.size() >= 2 && HasProfile) { SmallVector Weights; for (auto Prob : BBSuccProbs) Weights.push_back(Prob.getNumerator()); auto TI = BB->getTerminator(); TI->setMetadata( LLVMContext::MD_prof, MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights)); } } /// duplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch /// to BB which contains an i1 PHI node and a conditional branch on that PHI. /// If we can duplicate the contents of BB up into PredBB do so now, this /// improves the odds that the branch will be on an analyzable instruction like /// a compare. bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred( BasicBlock *BB, const SmallVectorImpl &PredBBs) { assert(!PredBBs.empty() && "Can't handle an empty set"); // If BB is a loop header, then duplicating this block outside the loop would // cause us to transform this into an irreducible loop, don't do this. // See the comments above findLoopHeaders for justifications and caveats. if (LoopHeaders.count(BB)) { LLVM_DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName() << "' into predecessor block '" << PredBBs[0]->getName() << "' - it might create an irreducible loop!\n"); return false; } unsigned DuplicationCost = getJumpThreadDuplicationCost( TTI, BB, BB->getTerminator(), BBDupThreshold); if (DuplicationCost > BBDupThreshold) { LLVM_DEBUG(dbgs() << " Not duplicating BB '" << BB->getName() << "' - Cost is too high: " << DuplicationCost << "\n"); return false; } // And finally, do it! Start by factoring the predecessors if needed. std::vector Updates; BasicBlock *PredBB; if (PredBBs.size() == 1) PredBB = PredBBs[0]; else { LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); PredBB = splitBlockPreds(BB, PredBBs, ".thr_comm"); } Updates.push_back({DominatorTree::Delete, PredBB, BB}); // Okay, we decided to do this! Clone all the instructions in BB onto the end // of PredBB. LLVM_DEBUG(dbgs() << " Duplicating block '" << BB->getName() << "' into end of '" << PredBB->getName() << "' to eliminate branch on phi. 
Cost: " << DuplicationCost << " block is:" << *BB << "\n"); // Unless PredBB ends with an unconditional branch, split the edge so that we // can just clone the bits from BB into the end of the new PredBB. BranchInst *OldPredBranch = dyn_cast(PredBB->getTerminator()); if (!OldPredBranch || !OldPredBranch->isUnconditional()) { BasicBlock *OldPredBB = PredBB; PredBB = SplitEdge(OldPredBB, BB); Updates.push_back({DominatorTree::Insert, OldPredBB, PredBB}); Updates.push_back({DominatorTree::Insert, PredBB, BB}); Updates.push_back({DominatorTree::Delete, OldPredBB, BB}); OldPredBranch = cast(PredBB->getTerminator()); } // We are going to have to map operands from the original BB block into the // PredBB block. Evaluate PHI nodes in BB. DenseMap ValueMapping; BasicBlock::iterator BI = BB->begin(); for (; PHINode *PN = dyn_cast(BI); ++BI) ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB); // Clone the non-phi instructions of BB into PredBB, keeping track of the // mapping and using it to remap operands in the cloned instructions. for (; BI != BB->end(); ++BI) { Instruction *New = BI->clone(); New->insertInto(PredBB, OldPredBranch->getIterator()); // Remap operands to patch up intra-block references. for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) if (Instruction *Inst = dyn_cast(New->getOperand(i))) { DenseMap::iterator I = ValueMapping.find(Inst); if (I != ValueMapping.end()) New->setOperand(i, I->second); } // If this instruction can be simplified after the operands are updated, // just use the simplified value instead. This frequently happens due to // phi translation. if (Value *IV = simplifyInstruction( New, {BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) { ValueMapping[&*BI] = IV; if (!New->mayHaveSideEffects()) { New->eraseFromParent(); New = nullptr; } } else { ValueMapping[&*BI] = New; } if (New) { // Otherwise, insert the new instruction into the block. New->setName(BI->getName()); // Update Dominance from simplified New instruction operands. for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) if (BasicBlock *SuccBB = dyn_cast(New->getOperand(i))) Updates.push_back({DominatorTree::Insert, PredBB, SuccBB}); } } // Check to see if the targets of the branch had PHI nodes. If so, we need to // add entries to the PHI nodes for branch from PredBB now. BranchInst *BBBranch = cast(BB->getTerminator()); addPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(0), BB, PredBB, ValueMapping); addPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB, ValueMapping); updateSSA(BB, PredBB, ValueMapping); // PredBB no longer jumps to BB, remove entries in the PHI node for the edge // that we nuked. BB->removePredecessor(PredBB, true); // Remove the unconditional branch at the end of the PredBB block. OldPredBranch->eraseFromParent(); if (auto *BPI = getBPI()) BPI->copyEdgeProbabilities(BB, PredBB); DTU->applyUpdatesPermissive(Updates); ++NumDupes; return true; } // Pred is a predecessor of BB with an unconditional branch to BB. SI is // a Select instruction in Pred. BB has other predecessors and SI is used in // a PHI node in BB. SI has no other use. // A new basic block, NewBB, is created and SI is converted to compare and // conditional branch. SI is erased from parent. void JumpThreadingPass::unfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB, SelectInst *SI, PHINode *SIUse, unsigned Idx) { // Expand the select. 
// // Pred -- // | v // | NewBB // | | // |----- // v // BB BranchInst *PredTerm = cast(Pred->getTerminator()); BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold", BB->getParent(), BB); // Move the unconditional branch to NewBB. PredTerm->removeFromParent(); PredTerm->insertInto(NewBB, NewBB->end()); // Create a conditional branch and update PHI nodes. auto *BI = BranchInst::Create(NewBB, BB, SI->getCondition(), Pred); BI->applyMergedLocation(PredTerm->getDebugLoc(), SI->getDebugLoc()); BI->copyMetadata(*SI, {LLVMContext::MD_prof}); SIUse->setIncomingValue(Idx, SI->getFalseValue()); SIUse->addIncoming(SI->getTrueValue(), NewBB); uint64_t TrueWeight = 1; uint64_t FalseWeight = 1; // Copy probabilities from 'SI' to created conditional branch in 'Pred'. if (extractBranchWeights(*SI, TrueWeight, FalseWeight) && (TrueWeight + FalseWeight) != 0) { SmallVector BP; BP.emplace_back(BranchProbability::getBranchProbability( TrueWeight, TrueWeight + FalseWeight)); BP.emplace_back(BranchProbability::getBranchProbability( FalseWeight, TrueWeight + FalseWeight)); // Update BPI if exists. if (auto *BPI = getBPI()) BPI->setEdgeProbability(Pred, BP); } // Set the block frequency of NewBB. if (auto *BFI = getBFI()) { if ((TrueWeight + FalseWeight) == 0) { TrueWeight = 1; FalseWeight = 1; } BranchProbability PredToNewBBProb = BranchProbability::getBranchProbability( TrueWeight, TrueWeight + FalseWeight); auto NewBBFreq = BFI->getBlockFreq(Pred) * PredToNewBBProb; BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); } // The select is now dead. SI->eraseFromParent(); DTU->applyUpdatesPermissive({{DominatorTree::Insert, NewBB, BB}, {DominatorTree::Insert, Pred, NewBB}}); // Update any other PHI nodes in BB. for (BasicBlock::iterator BI = BB->begin(); PHINode *Phi = dyn_cast(BI); ++BI) if (Phi != SIUse) Phi->addIncoming(Phi->getIncomingValueForBlock(Pred), NewBB); } bool JumpThreadingPass::tryToUnfoldSelect(SwitchInst *SI, BasicBlock *BB) { PHINode *CondPHI = dyn_cast(SI->getCondition()); if (!CondPHI || CondPHI->getParent() != BB) return false; for (unsigned I = 0, E = CondPHI->getNumIncomingValues(); I != E; ++I) { BasicBlock *Pred = CondPHI->getIncomingBlock(I); SelectInst *PredSI = dyn_cast(CondPHI->getIncomingValue(I)); // The second and third condition can be potentially relaxed. Currently // the conditions help to simplify the code and allow us to reuse existing // code, developed for tryToUnfoldSelect(CmpInst *, BasicBlock *) if (!PredSI || PredSI->getParent() != Pred || !PredSI->hasOneUse()) continue; BranchInst *PredTerm = dyn_cast(Pred->getTerminator()); if (!PredTerm || !PredTerm->isUnconditional()) continue; unfoldSelectInstr(Pred, BB, PredSI, CondPHI, I); return true; } return false; } /// tryToUnfoldSelect - Look for blocks of the form /// bb1: /// %a = select /// br bb2 /// /// bb2: /// %p = phi [%a, %bb1] ... /// %c = icmp %p /// br i1 %c /// /// And expand the select into a branch structure if one of its arms allows %c /// to be folded. This later enables threading from bb1 over bb2. 
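/// After unfolding (sketch): bb1 instead branches on the select's condition,
/// either to a new block that forwards the true arm to bb2 or directly to bb2
/// with the false arm, so %c becomes foldable by LVI on at least one of the
/// two new incoming edges.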
bool JumpThreadingPass::tryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { BranchInst *CondBr = dyn_cast(BB->getTerminator()); PHINode *CondLHS = dyn_cast(CondCmp->getOperand(0)); Constant *CondRHS = cast(CondCmp->getOperand(1)); if (!CondBr || !CondBr->isConditional() || !CondLHS || CondLHS->getParent() != BB) return false; for (unsigned I = 0, E = CondLHS->getNumIncomingValues(); I != E; ++I) { BasicBlock *Pred = CondLHS->getIncomingBlock(I); SelectInst *SI = dyn_cast(CondLHS->getIncomingValue(I)); // Look if one of the incoming values is a select in the corresponding // predecessor. if (!SI || SI->getParent() != Pred || !SI->hasOneUse()) continue; BranchInst *PredTerm = dyn_cast(Pred->getTerminator()); if (!PredTerm || !PredTerm->isUnconditional()) continue; // Now check if one of the select values would allow us to constant fold the // terminator in BB. We don't do the transform if both sides fold, those // cases will be threaded in any case. LazyValueInfo::Tristate LHSFolds = LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1), CondRHS, Pred, BB, CondCmp); LazyValueInfo::Tristate RHSFolds = LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2), CondRHS, Pred, BB, CondCmp); if ((LHSFolds != LazyValueInfo::Unknown || RHSFolds != LazyValueInfo::Unknown) && LHSFolds != RHSFolds) { unfoldSelectInstr(Pred, BB, SI, CondLHS, I); return true; } } return false; } /// tryToUnfoldSelectInCurrBB - Look for PHI/Select or PHI/CMP/Select in the /// same BB in the form /// bb: /// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ... /// %s = select %p, trueval, falseval /// /// or /// /// bb: /// %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ... /// %c = cmp %p, 0 /// %s = select %c, trueval, falseval /// /// And expand the select into a branch structure. This later enables /// jump-threading over bb in this pass. /// /// Using the similar approach of SimplifyCFG::FoldCondBranchOnPHI(), unfold /// select if the associated PHI has at least one constant. If the unfolded /// select is not jump-threaded, it will be folded again in the later /// optimizations. bool JumpThreadingPass::tryToUnfoldSelectInCurrBB(BasicBlock *BB) { // This transform would reduce the quality of msan diagnostics. // Disable this transform under MemorySanitizer. if (BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory)) return false; // If threading this would thread across a loop header, don't thread the edge. // See the comments above findLoopHeaders for justifications and caveats. if (LoopHeaders.count(BB)) return false; for (BasicBlock::iterator BI = BB->begin(); PHINode *PN = dyn_cast(BI); ++BI) { // Look for a Phi having at least one constant incoming value. if (llvm::all_of(PN->incoming_values(), [](Value *V) { return !isa(V); })) continue; auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) { using namespace PatternMatch; // Check if SI is in BB and use V as condition. if (SI->getParent() != BB) return false; Value *Cond = SI->getCondition(); bool IsAndOr = match(SI, m_CombineOr(m_LogicalAnd(), m_LogicalOr())); return Cond && Cond == V && Cond->getType()->isIntegerTy(1) && !IsAndOr; }; SelectInst *SI = nullptr; for (Use &U : PN->uses()) { if (ICmpInst *Cmp = dyn_cast(U.getUser())) { // Look for a ICmp in BB that compares PN with a constant and is the // condition of a Select. 
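        // e.g. (hypothetical IR):
        //   %c = icmp eq i32 %p, 4
        //   %s = select i1 %c, i32 %a, i32 %b
        // where %p is PN, %c is the ICmp and %s is the select we will unfold.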
if (Cmp->getParent() == BB && Cmp->hasOneUse() && isa(Cmp->getOperand(1 - U.getOperandNo()))) if (SelectInst *SelectI = dyn_cast(Cmp->user_back())) if (isUnfoldCandidate(SelectI, Cmp->use_begin()->get())) { SI = SelectI; break; } } else if (SelectInst *SelectI = dyn_cast(U.getUser())) { // Look for a Select in BB that uses PN as condition. if (isUnfoldCandidate(SelectI, U.get())) { SI = SelectI; break; } } } if (!SI) continue; // Expand the select. Value *Cond = SI->getCondition(); if (!isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI)) Cond = new FreezeInst(Cond, "cond.fr", SI); Instruction *Term = SplitBlockAndInsertIfThen(Cond, SI, false); BasicBlock *SplitBB = SI->getParent(); BasicBlock *NewBB = Term->getParent(); PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI); NewPN->addIncoming(SI->getTrueValue(), Term->getParent()); NewPN->addIncoming(SI->getFalseValue(), BB); SI->replaceAllUsesWith(NewPN); SI->eraseFromParent(); // NewBB and SplitBB are newly created blocks which require insertion. std::vector Updates; Updates.reserve((2 * SplitBB->getTerminator()->getNumSuccessors()) + 3); Updates.push_back({DominatorTree::Insert, BB, SplitBB}); Updates.push_back({DominatorTree::Insert, BB, NewBB}); Updates.push_back({DominatorTree::Insert, NewBB, SplitBB}); // BB's successors were moved to SplitBB, update DTU accordingly. for (auto *Succ : successors(SplitBB)) { Updates.push_back({DominatorTree::Delete, BB, Succ}); Updates.push_back({DominatorTree::Insert, SplitBB, Succ}); } DTU->applyUpdatesPermissive(Updates); return true; } return false; } /// Try to propagate a guard from the current BB into one of its predecessors /// in case if another branch of execution implies that the condition of this /// guard is always true. Currently we only process the simplest case that /// looks like: /// /// Start: /// %cond = ... /// br i1 %cond, label %T1, label %F1 /// T1: /// br label %Merge /// F1: /// br label %Merge /// Merge: /// %condGuard = ... /// call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ] /// /// And cond either implies condGuard or !condGuard. In this case all the /// instructions before the guard can be duplicated in both branches, and the /// guard is then threaded to one of them. bool JumpThreadingPass::processGuards(BasicBlock *BB) { using namespace PatternMatch; // We only want to deal with two predecessors. BasicBlock *Pred1, *Pred2; auto PI = pred_begin(BB), PE = pred_end(BB); if (PI == PE) return false; Pred1 = *PI++; if (PI == PE) return false; Pred2 = *PI++; if (PI != PE) return false; if (Pred1 == Pred2) return false; // Try to thread one of the guards of the block. // TODO: Look up deeper than to immediate predecessor? auto *Parent = Pred1->getSinglePredecessor(); if (!Parent || Parent != Pred2->getSinglePredecessor()) return false; if (auto *BI = dyn_cast(Parent->getTerminator())) for (auto &I : *BB) if (isGuard(&I) && threadGuard(BB, cast(&I), BI)) return true; return false; } /// Try to propagate the guard from BB which is the lower block of a diamond /// to one of its branches, in case if diamond's condition implies guard's /// condition. 
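/// Sketch of the result: every instruction up to and including the guard is
/// duplicated into the branch where the implication could not be proved, the
/// other branch receives copies of only the instructions before the guard, and
/// PHI nodes in BB merge the two sets of copies for values that still have
/// uses.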
bool JumpThreadingPass::threadGuard(BasicBlock *BB, IntrinsicInst *Guard, BranchInst *BI) { assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?"); assert(BI->isConditional() && "Unconditional branch has 2 successors?"); Value *GuardCond = Guard->getArgOperand(0); Value *BranchCond = BI->getCondition(); BasicBlock *TrueDest = BI->getSuccessor(0); BasicBlock *FalseDest = BI->getSuccessor(1); auto &DL = BB->getModule()->getDataLayout(); bool TrueDestIsSafe = false; bool FalseDestIsSafe = false; // True dest is safe if BranchCond => GuardCond. auto Impl = isImpliedCondition(BranchCond, GuardCond, DL); if (Impl && *Impl) TrueDestIsSafe = true; else { // False dest is safe if !BranchCond => GuardCond. Impl = isImpliedCondition(BranchCond, GuardCond, DL, /* LHSIsTrue */ false); if (Impl && *Impl) FalseDestIsSafe = true; } if (!TrueDestIsSafe && !FalseDestIsSafe) return false; BasicBlock *PredUnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest; BasicBlock *PredGuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest; ValueToValueMapTy UnguardedMapping, GuardedMapping; Instruction *AfterGuard = Guard->getNextNode(); unsigned Cost = getJumpThreadDuplicationCost(TTI, BB, AfterGuard, BBDupThreshold); if (Cost > BBDupThreshold) return false; // Duplicate all instructions before the guard and the guard itself to the // branch where implication is not proved. BasicBlock *GuardedBlock = DuplicateInstructionsInSplitBetween( BB, PredGuardedBlock, AfterGuard, GuardedMapping, *DTU); assert(GuardedBlock && "Could not create the guarded block?"); // Duplicate all instructions before the guard in the unguarded branch. // Since we have successfully duplicated the guarded block and this block // has fewer instructions, we expect it to succeed. BasicBlock *UnguardedBlock = DuplicateInstructionsInSplitBetween( BB, PredUnguardedBlock, Guard, UnguardedMapping, *DTU); assert(UnguardedBlock && "Could not create the unguarded block?"); LLVM_DEBUG(dbgs() << "Moved guard " << *Guard << " to block " << GuardedBlock->getName() << "\n"); // Some instructions before the guard may still have uses. For them, we need // to create Phi nodes merging their copies in both guarded and unguarded // branches. Those instructions that have no uses can be just removed. SmallVector ToRemove; for (auto BI = BB->begin(); &*BI != AfterGuard; ++BI) if (!isa(&*BI)) ToRemove.push_back(&*BI); Instruction *InsertionPoint = &*BB->getFirstInsertionPt(); assert(InsertionPoint && "Empty block?"); // Substitute with Phis & remove. for (auto *Inst : reverse(ToRemove)) { if (!Inst->use_empty()) { PHINode *NewPN = PHINode::Create(Inst->getType(), 2); NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock); NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock); NewPN->insertBefore(InsertionPoint); Inst->replaceAllUsesWith(NewPN); } Inst->eraseFromParent(); } return true; } PreservedAnalyses JumpThreadingPass::getPreservedAnalysis() const { PreservedAnalyses PA; PA.preserve(); PA.preserve(); // TODO: We would like to preserve BPI/BFI. Enable once all paths update them. // TODO: Would be nice to verify BPI/BFI consistency as well. return PA; } template typename AnalysisT::Result *JumpThreadingPass::runExternalAnalysis() { assert(FAM && "Can't run external analysis without FunctionAnalysisManager"); // If there were no changes since last call to 'runExternalAnalysis' then all // analysis is either up to date or explicitly invalidated. Just go ahead and // run the "external" analysis. 
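  // (Typical use, per the helpers further down: getOrCreateBPI(true) comes
  // through here to compute a fresh BranchProbabilityInfo when no cached
  // result exists.)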
if (!ChangedSinceLastAnalysisUpdate) { assert(!DTU->hasPendingUpdates() && "Lost update of 'ChangedSinceLastAnalysisUpdate'?"); // Run the "external" analysis. return &FAM->getResult(*F); } ChangedSinceLastAnalysisUpdate = false; auto PA = getPreservedAnalysis(); // TODO: This shouldn't be needed once 'getPreservedAnalysis' reports BPI/BFI // as preserved. PA.preserve(); PA.preserve(); // Report everything except explicitly preserved as invalid. FAM->invalidate(*F, PA); // Update DT/PDT. DTU->flush(); // Make sure DT/PDT are valid before running "external" analysis. assert(DTU->getDomTree().verify(DominatorTree::VerificationLevel::Fast)); assert((!DTU->hasPostDomTree() || DTU->getPostDomTree().verify( PostDominatorTree::VerificationLevel::Fast))); // Run the "external" analysis. auto *Result = &FAM->getResult(*F); // Update analysis JumpThreading depends on and not explicitly preserved. TTI = &FAM->getResult(*F); TLI = &FAM->getResult(*F); AA = &FAM->getResult(*F); return Result; } BranchProbabilityInfo *JumpThreadingPass::getBPI() { if (!BPI) { assert(FAM && "Can't create BPI without FunctionAnalysisManager"); BPI = FAM->getCachedResult(*F); } return *BPI; } BlockFrequencyInfo *JumpThreadingPass::getBFI() { if (!BFI) { assert(FAM && "Can't create BFI without FunctionAnalysisManager"); BFI = FAM->getCachedResult(*F); } return *BFI; } // Important note on validity of BPI/BFI. JumpThreading tries to preserve // BPI/BFI as it goes. Thus if cached instance exists it will be updated. // Otherwise, new instance of BPI/BFI is created (up to date by definition). BranchProbabilityInfo *JumpThreadingPass::getOrCreateBPI(bool Force) { auto *Res = getBPI(); if (Res) return Res; if (Force) BPI = runExternalAnalysis(); return *BPI; } BlockFrequencyInfo *JumpThreadingPass::getOrCreateBFI(bool Force) { auto *Res = getBFI(); if (Res) return Res; if (Force) BFI = runExternalAnalysis(); return *BFI; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d7e40e8ef978..b603bbe55dc9 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1,10722 +1,10756 @@ //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops // and generates target-independent LLVM-IR. // The vectorizer uses the TargetTransformInfo analysis to estimate the costs // of instructions in order to estimate the profitability of vectorization. // // The loop vectorizer combines consecutive loop iterations into a single // 'wide' iteration. After this transformation the index is incremented // by the SIMD vector width, and not by one. // // This pass has three parts: // 1. The main loop pass that drives the different parts. // 2. LoopVectorizationLegality - A unit that checks for the legality // of the vectorization. // 3. InnerLoopVectorizer - A unit that performs the actual // widening of instructions. // 4. LoopVectorizationCostModel - A unit that checks for the profitability // of vectorization. It decides on the optimal vector width, which // can be one, if vectorization is not profitable. 
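//
// As a rough illustration (not tied to any particular target): a loop such as
//   for (i = 0; i < n; i++) a[i] = b[i] + c[i];
// becomes a loop that loads, adds and stores VF elements per iteration using
// wide vector instructions, with a scalar remainder loop handling the final
// n % VF iterations when the tail is not folded.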
// // There is a development effort going on to migrate loop vectorizer to the // VPlan infrastructure and to introduce outer loop vectorization support (see // docs/Proposal/VectorizationPlan.rst and // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this // purpose, we temporarily introduced the VPlan-native vectorization path: an // alternative vectorization path that is natively implemented on top of the // VPlan infrastructure. See EnableVPlanNativePath for enabling. // //===----------------------------------------------------------------------===// // // The reduction-variable vectorization is based on the paper: // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. // // Variable uniformity checks are inspired by: // Karrenberg, R. and Hack, S. Whole Function Vectorization. // // The interleaved access vectorization is based on the paper: // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved // Data for SIMD // // Other ideas/concepts are from: // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. // // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of // Vectorizing Compilers. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "LoopVectorizationPlanner.h" #include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanHCFGBuilder.h" #include "VPlanTransforms.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include 
"llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/InstructionCost.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/InjectTLIMappings.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Utils/SizeOpts.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include #include #include #include #include #include #include #include #include #include #include #include using namespace llvm; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME #ifndef NDEBUG const char VerboseDebug[] = DEBUG_TYPE "-verbose"; #endif /// @{ /// Metadata attribute names const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; const char LLVMLoopVectorizeFollowupVectorized[] = "llvm.loop.vectorize.followup_vectorized"; const char LLVMLoopVectorizeFollowupEpilogue[] = "llvm.loop.vectorize.followup_epilogue"; /// @} STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); static cl::opt EnableEpilogueVectorization( "enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops.")); static cl::opt EpilogueVectorizationForceVF( "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops.")); static cl::opt EpilogueVectorizationMinVF( "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization.")); /// Loops with a known constant trip count below this number are vectorized only /// if no scalar iteration overheads are incurred. static cl::opt TinyTripCountVectorThreshold( "vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred.")); static cl::opt VectorizeMemoryCheckThreshold( "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks")); // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, // that predication is preferred, and this lists all options. I.e., the // vectorizer will try to fold the tail-loop (epilogue) into the vector body // and predicate the instructions accordingly. 
If tail-folding fails, there are // different fallback strategies depending on these values: namespace PreferPredicateTy { enum Option { ScalarEpilogue = 0, PredicateElseScalarEpilogue, PredicateOrDontVectorize }; } // namespace PreferPredicateTy static cl::opt PreferPredicateOverEpilogue( "prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails."))); static cl::opt ForceTailFoldingStyle( "force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values( clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN( TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN( TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"))); static cl::opt MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop.")); static cl::opt EnableInterleavedMemAccesses( "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop")); /// An interleave-group may need masking if it resides in a block that needs /// predication, or in order to mask away gaps. 
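// For example (illustrative sketch only, guarded out of the build): a scalar
// model of a masked interleaved load group with a gap. The member layout and
// helper name below are hypothetical; a real wide load reads Factor * VF
// contiguous lanes at once, and lanes belonging to gaps or to predicated-off
// iterations must be masked so they are never accessed.
#if 0
#include <cstddef>
#include <vector>

// Interleave group {A = Base[3*i], B = Base[3*i+1], gap at Base[3*i+2]}.
// Mask[l] says whether lane l of the wide access may touch memory.
static void maskedInterleavedLoad(const std::vector<int> &Base, size_t VF,
                                  const std::vector<bool> &Mask,
                                  std::vector<int> &A, std::vector<int> &B) {
  const size_t Factor = 3;
  for (size_t I = 0; I < VF; ++I) {
    if (Mask[Factor * I + 0])
      A[I] = Base[Factor * I + 0];
    if (Mask[Factor * I + 1])
      B[I] = Base[Factor * I + 1];
    // Lane Factor * I + 2 is a gap: its mask bit is always false.
  }
}
#endif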
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));
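// Illustrative sketch (guarded out of the build): what interleaving a scalar
// reduction by an interleave count of 2 looks like at the source level -- two
// independent partial sums break the loop-carried dependence chain and expose
// ILP. The function names are hypothetical and not part of this pass.
#if 0
#include <cstddef>

// Original reduction: every add depends on the previous one.
static int sumScalar(const int *A, size_t N) {
  int Sum = 0;
  for (size_t I = 0; I < N; ++I)
    Sum += A[I];
  return Sum;
}

// Interleaved by IC = 2: two independent accumulators, combined after the loop.
static int sumInterleavedBy2(const int *A, size_t N) {
  int Sum0 = 0, Sum1 = 0;
  size_t I = 0;
  for (; I + 1 < N; I += 2) {
    Sum0 += A[I];
    Sum1 += A[I + 1];
  }
  for (; I < N; ++I) // remainder iteration, if any
    Sum0 += A[I];
  return Sum0 + Sum1;
}
#endif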
/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

namespace llvm {
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
}

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));

cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
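// Illustrative sketch (guarded out of the build) of how that reciprocal is
// applied when costing a predicated block: the block's cost is divided by the
// returned value, i.e. halved under the 50% assumption. The helper name is
// hypothetical.
#if 0
// With a reciprocal block probability of 2, a block guarded by a predicate is
// assumed to run on every other loop iteration, so its contribution to the
// per-iteration cost of the loop body is BlockCost / 2.
static unsigned discountedPredicatedBlockCost(unsigned BlockCost) {
  return BlockCost / 2;
}
#endif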
static unsigned getReciprocalPredBlockProb() { return 2; } /// A helper function that returns an integer or floating-point constant with /// value C. static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) : ConstantFP::get(Ty, C); } /// Returns "best known" trip count for the specified loop \p L as defined by /// the following procedure: /// 1) Returns exact trip count if it is known. /// 2) Returns expected trip count according to profile data if any. /// 3) Returns upper bound estimate if it is known. /// 4) Returns std::nullopt if all of the above failed. static std::optional getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { // Check if exact trip count is known. if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) return ExpectedTC; // Check if there is an expected trip count available from profile data. if (LoopVectorizeWithBlockFrequency) if (auto EstimatedTC = getLoopEstimatedTripCount(L)) return *EstimatedTC; // Check if upper bound estimate is known. if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) return ExpectedTC; return std::nullopt; } /// Return a vector containing interleaved elements from multiple /// smaller input vectors. static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals, const Twine &Name) { unsigned Factor = Vals.size(); assert(Factor > 1 && "Tried to interleave invalid number of vectors"); VectorType *VecTy = cast(Vals[0]->getType()); #ifndef NDEBUG for (Value *Val : Vals) assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); #endif // Scalable vectors cannot use arbitrary shufflevectors (only splats), so // must use intrinsics to interleave. if (VecTy->isScalableTy()) { VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); return Builder.CreateIntrinsic( WideVecTy, Intrinsic::experimental_vector_interleave2, Vals, /*FMFSource=*/nullptr, Name); } // Fixed length. Start by concatenating all vectors into a wide vector. Value *WideVec = concatenateVectors(Builder, Vals); // Interleave the elements into the wide vector. const unsigned NumElts = VecTy->getElementCount().getFixedValue(); return Builder.CreateShuffleVector( WideVec, createInterleaveMask(NumElts, Factor), Name); } namespace { // Forward declare GeneratedRTChecks. class GeneratedRTChecks; using SCEV2ValueTy = DenseMap; } // namespace namespace llvm { AnalysisKey ShouldRunExtraVectorPasses::Key; /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple /// scalars. This class also implements the following features: /// * It inserts an epilogue loop for handling loops that don't have iteration /// counts that are known to be a multiple of the vectorization factor. /// * It handles the code generation for reduction variables. /// * Scalarization (implementation using scalars) of un-vectorizable /// instructions. /// InnerLoopVectorizer does not perform any vectorization-legality /// checks, and relies on the caller to check for the different legality /// aspects. The InnerLoopVectorizer relies on the /// LoopVectorizationLegality class to provide information about the induction /// and reduction variables that were found to a given vectorization factor. 
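// Illustrative sketch (guarded out of the build): the runtime shape of the
// code this class emits, written at the source level. The VF/UF values and
// the function name are hypothetical; the branch between the two loops
// corresponds to the middle block of the generated skeleton.
#if 0
#include <cstddef>

// Conceptual shape of "for (I = 0; I < N; ++I) body(I)" vectorized with
// VF = 4 and UF = 2: a vector loop over the largest multiple of VF * UF,
// followed by the original scalar loop acting as the remainder/epilogue.
static void vectorizedShape(size_t N) {
  const size_t VF = 4, UF = 2;
  size_t VectorTripCount = N - N % (VF * UF);
  size_t I = 0;
  // Iteration-count check: skip the vector loop entirely if it would not run.
  if (VectorTripCount != 0)
    for (; I < VectorTripCount; I += VF * UF)
      /* widened body: VF * UF iterations' worth of work per trip */;
  // The middle block branches here; the scalar loop finishes the remainder.
  for (; I < N; ++I)
    /* original scalar body */;
}
#endif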
class InnerLoopVectorizer { public: InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), PSI(PSI), RTChecks(RTChecks) { // Query this against the original loop and save it here because the profile // of the original loop header may change as the transformation happens. OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); if (MinProfitableTripCount.isZero()) this->MinProfitableTripCount = VecWidth; else this->MinProfitableTripCount = MinProfitableTripCount; } virtual ~InnerLoopVectorizer() = default; /// Create a new empty loop that will contain vectorized instructions later /// on, while the old loop will be used as the scalar remainder. Control flow /// is generated around the vectorized (and scalar epilogue) loops consisting /// of various checks and bypasses. Return the pre-header block of the new /// loop and the start value for the canonical induction, if it is != 0. The /// latter is the case when vectorizing the epilogue loop. In the case of /// epilogue vectorization, this function is overriden to handle the more /// complex control flow around the loops. \p ExpandedSCEVs is used to /// look up SCEV expansions for expressions needed during skeleton creation. virtual std::pair createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); // Return true if any runtime check is added. bool areSafetyChecksAdded() { return AddedSafetyChecks; } /// A type for vectorized values in the new loop. Each value from the /// original loop, when vectorized, is represented by UF vector values in the /// new unrolled loop, where UF is the unroll factor. using VectorParts = SmallVector; /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p /// Instr's operands. void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, VPTransformState &State); /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, VPTransformState &State); /// Try to vectorize interleaved access group \p Group with the base address /// given in \p Addr, optionally masking the vector operations if \p /// BlockInMask is non-null. Use \p State to translate given VPValues to IR /// values in the vectorized loop. void vectorizeInterleaveGroup(const InterleaveGroup *Group, ArrayRef VPDefs, VPTransformState &State, VPValue *Addr, ArrayRef StoredValues, VPValue *BlockInMask, bool NeedsMaskForGaps); /// Fix the non-induction PHIs in \p Plan. 
void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); /// Returns true if the reordering of FP operations is not allowed, but we are /// able to vectorize with strict in-order reductions for the given RdxDesc. bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); // Returns the resume value (bc.merge.rdx) for a reduction as // generated by fixReduction. PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); /// Create a new phi node for the induction variable \p OrigPhi to resume /// iteration count in the scalar epilogue, from where the vectorized loop /// left off. \p Step is the SCEV-expanded induction step to use. In cases /// where the loop skeleton is more complicated (i.e., epilogue vectorization) /// and the resume values can come from an additional bypass block, the \p /// AdditionalBypass pair provides information about the bypass block and the /// end value on the edge from bypass to this loop. PHINode *createInductionResumeValue( PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, ArrayRef BypassBlocks, std::pair AdditionalBypass = {nullptr, nullptr}); /// Returns the original loop trip count. Value *getTripCount() const { return TripCount; } /// Used to set the trip count after ILV's construction and after the /// preheader block has been executed. Note that this always holds the trip /// count of the original loop for both main loop and epilogue vectorization. void setTripCount(Value *TC) { TripCount = TC; } protected: friend class LoopVectorizationPlanner; /// A small list of PHINodes. using PhiVector = SmallVector; /// A type for scalarized values in the new loop. Each value from the /// original loop, when scalarized, is represented by UF x VF scalar values /// in the new unrolled loop, where UF is the unroll factor and VF is the /// vectorization factor. using ScalarParts = SmallVector, 2>; /// Set up the values of the IVs correctly when exiting the vector loop. void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, BasicBlock *VectorHeader, VPlan &Plan, VPTransformState &State); /// Handle all cross-iteration phis in the header. void fixCrossIterationPHIs(VPTransformState &State); /// Create the exit value of first order recurrences in the middle block and /// update their users. void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State); /// Create code for the loop exit value of the reduction. void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); /// Shrinks vector element sizes to the smallest bitwidth they can be legally /// represented as. void truncateToMinimalBitwidths(VPTransformState &State); /// Returns (and creates if needed) the trip count of the widened loop. Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); /// Returns a bitcasted value to the requested vector type. /// Also handles bitcasts of vector <-> vector types. Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL); /// Emit a bypass check to see if the vector trip count is zero, including if /// it overflows. void emitIterationCountCheck(BasicBlock *Bypass); /// Emit a bypass check to see if all of the SCEV assumptions we've /// had to make are correct. 
Returns the block containing the checks or /// nullptr if no checks have been added. BasicBlock *emitSCEVChecks(BasicBlock *Bypass); /// Emit bypass checks to check any memory assumptions we may have made. /// Returns the block containing the checks or nullptr if no checks have been /// added. BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass); /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, /// vector loop preheader, middle block and scalar preheader. void createVectorLoopSkeleton(StringRef Prefix); /// Create new phi nodes for the induction variables to resume iteration count /// in the scalar epilogue, from where the vectorized loop left off. /// In cases where the loop skeleton is more complicated (eg. epilogue /// vectorization) and the resume values can come from an additional bypass /// block, the \p AdditionalBypass pair provides information about the bypass /// block and the end value on the edge from bypass to this loop. void createInductionResumeValues( const SCEV2ValueTy &ExpandedSCEVs, std::pair AdditionalBypass = {nullptr, nullptr}); /// Complete the loop skeleton by adding debug MDs, creating appropriate /// conditional branches in the middle block, preparing the builder and /// running the verifier. Return the preheader of the completed vector loop. BasicBlock *completeLoopSkeleton(); /// Collect poison-generating recipes that may generate a poison value that is /// used after vectorization, even when their operands are not poison. Those /// recipes meet the following conditions: /// * Contribute to the address computation of a recipe generating a widen /// memory load/store (VPWidenMemoryInstructionRecipe or /// VPInterleaveRecipe). /// * Such a widen memory load/store has at least one underlying Instruction /// that is in a basic block that needs predication and after vectorization /// the generated instruction won't be predicated. void collectPoisonGeneratingRecipes(VPTransformState &State); /// Allow subclasses to override and print debug traces before/after vplan /// execution, when trace information is requested. virtual void printDebugTracesAtStart(){}; virtual void printDebugTracesAtEnd(){}; /// The original loop. Loop *OrigLoop; /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies /// dynamic knowledge to simplify SCEV expressions and converts them to a /// more usable form. PredicatedScalarEvolution &PSE; /// Loop Info. LoopInfo *LI; /// Dominator Tree. DominatorTree *DT; /// Target Library Info. const TargetLibraryInfo *TLI; /// Target Transform Info. const TargetTransformInfo *TTI; /// Assumption Cache. AssumptionCache *AC; /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. ElementCount VF; ElementCount MinProfitableTripCount; /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; /// The builder that we use IRBuilder<> Builder; // --- Vectorization state --- /// The vector-loop preheader. BasicBlock *LoopVectorPreHeader; /// The scalar-loop preheader. BasicBlock *LoopScalarPreHeader; /// Middle Block between the vector and the scalar. BasicBlock *LoopMiddleBlock; /// The unique ExitBlock of the scalar loop if one exists. Note that /// there can be multiple exiting edges reaching this block. BasicBlock *LoopExitBlock; /// The scalar loop body. BasicBlock *LoopScalarBody; /// A list of all bypass blocks. 
The first block is the entry of the loop. SmallVector LoopBypassBlocks; /// Store instructions that were predicated. SmallVector PredicatedInstructions; /// Trip count of the original loop. Value *TripCount = nullptr; /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) Value *VectorTripCount = nullptr; /// The legality analysis. LoopVectorizationLegality *Legal; /// The profitablity analysis. LoopVectorizationCostModel *Cost; // Record whether runtime checks are added. bool AddedSafetyChecks = false; // Holds the end values for each induction variable. We save the end values // so we can later fix-up the external users of the induction variables. DenseMap IVEndValues; /// BFI and PSI are used to check for profile guided size optimizations. BlockFrequencyInfo *BFI; ProfileSummaryInfo *PSI; // Whether this loop should be optimized for size based on profile guided size // optimizatios. bool OptForSizeBasedOnProfile; /// Structure to hold information about generated runtime checks, responsible /// for cleaning the checks, if vectorization turns out unprofitable. GeneratedRTChecks &RTChecks; // Holds the resume values for reductions in the loops, used to set the // correct start value of reduction PHIs when vectorizing the epilogue. SmallMapVector ReductionResumeValues; }; class InnerLoopUnroller : public InnerLoopVectorizer { public: InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1), ElementCount::getFixed(1), UnrollFactor, LVL, CM, BFI, PSI, Check) {} }; /// Encapsulate information regarding vectorization of a loop and its epilogue. /// This information is meant to be updated and used across two stages of /// epilogue vectorization. struct EpilogueLoopVectorizationInfo { ElementCount MainLoopVF = ElementCount::getFixed(0); unsigned MainLoopUF = 0; ElementCount EpilogueVF = ElementCount::getFixed(0); unsigned EpilogueUF = 0; BasicBlock *MainLoopIterationCountCheck = nullptr; BasicBlock *EpilogueIterationCountCheck = nullptr; BasicBlock *SCEVSafetyCheck = nullptr; BasicBlock *MemSafetyCheck = nullptr; Value *TripCount = nullptr; Value *VectorTripCount = nullptr; EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF) : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) { assert(EUF == 1 && "A high UF for the epilogue loop is likely not beneficial."); } }; /// An extension of the inner loop vectorizer that creates a skeleton for a /// vectorized loop that has its epilogue (residual) also vectorized. /// The idea is to run the vplan on a given loop twice, firstly to setup the /// skeleton and vectorize the main loop, and secondly to complete the skeleton /// from the first step and vectorize the epilogue. This is achieved by /// deriving two concrete strategy classes from this base class and invoking /// them in succession from the loop vectorizer planner. 
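// Illustrative sketch (guarded out of the build) of the control-flow shape the
// two-stage scheme above produces at the source level: a main vector loop at a
// wide VF, a second vector loop at a narrower VF for the remainder, and
// finally the scalar loop. The VF values and the function name are
// hypothetical.
#if 0
#include <cstddef>

static void epilogueVectorizedShape(size_t N) {
  const size_t MainVF = 8, EpilogueVF = 4;
  size_t I = 0;
  // Main vector loop (first execution of the plan).
  for (; I + MainVF <= N; I += MainVF)
    /* body widened by MainVF */;
  // Vectorized epilogue (second execution of the plan) picks up most of the rest.
  for (; I + EpilogueVF <= N; I += EpilogueVF)
    /* body widened by EpilogueVF */;
  // Scalar remainder handles the last few iterations.
  for (; I < N; ++I)
    /* original scalar body */;
}
#endif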
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { public: InnerLoopAndEpilogueVectorizer( Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, Checks), EPI(EPI) {} // Override this function to handle the more complex control flow around the // three loops. std::pair createVectorizedLoopSkeleton( const SCEV2ValueTy &ExpandedSCEVs) final { return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs); } /// The interface for creating a vectorized skeleton using one of two /// different strategies, each corresponding to one execution of the vplan /// as described above. virtual std::pair createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0; /// Holds and updates state information required to vectorize the main loop /// and its epilogue in two separate passes. This setup helps us avoid /// regenerating and recomputing runtime safety checks. It also helps us to /// shorten the iteration-count-check path length for the cases where the /// iteration count of the loop is so small that the main vector loop is /// completely skipped. EpilogueLoopVectorizationInfo &EPI; }; /// A specialized derived class of inner loop vectorizer that performs /// vectorization of *main* loops in the process of vectorizing loops and their /// epilogues. class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { public: EpilogueVectorizerMainLoop( Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). std::pair createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; protected: /// Emits an iteration count bypass check once for the main loop (when \p /// ForEpilogue is false) and once for the epilogue loop (when \p /// ForEpilogue is true). BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); void printDebugTracesAtStart() override; void printDebugTracesAtEnd() override; }; // A specialized derived class of inner loop vectorizer that performs // vectorization of *epilogue* loops in the process of vectorizing loops and // their epilogues. 
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { public: EpilogueVectorizerEpilogueLoop( Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, LVL, CM, BFI, PSI, Checks) { TripCount = EPI.TripCount; } /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). std::pair createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; protected: /// Emits an iteration count bypass check after the main vector loop has /// finished to see if there are any iterations left to execute by either /// the vector epilogue or the scalar epilogue. BasicBlock *emitMinimumVectorEpilogueIterCountCheck( BasicBlock *Bypass, BasicBlock *Insert); void printDebugTracesAtStart() override; void printDebugTracesAtEnd() override; }; } // end namespace llvm /// Look for a meaningful debug location on the instruction or it's /// operands. static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { if (!I) return I; DebugLoc Empty; if (I->getDebugLoc() != Empty) return I; for (Use &Op : I->operands()) { if (Instruction *OpInst = dyn_cast(Op)) if (OpInst->getDebugLoc() != Empty) return OpInst; } return I; } /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I /// is passed, the message relates to that particular instruction. #ifndef NDEBUG static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I) { dbgs() << "LV: " << Prefix << DebugMsg; if (I != nullptr) dbgs() << " " << *I; else dbgs() << '.'; dbgs() << '\n'; } #endif /// Create an analysis remark that explains why vectorization failed /// /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p /// RemarkName is the identifier for the remark. If \p I is passed it is an /// instruction that prevents vectorization. Otherwise \p TheLoop is used for /// the location of the remark. \return the remark object that can be /// streamed to. static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I) { Value *CodeRegion = TheLoop->getHeader(); DebugLoc DL = TheLoop->getStartLoc(); if (I) { CodeRegion = I->getParent(); // If there is no debug location attached to the instruction, revert back to // using the loop's. if (I->getDebugLoc()) DL = I->getDebugLoc(); } return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); } namespace llvm { /// Return a value for Step multiplied by VF. Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step) { assert(Ty->isIntegerTy() && "Expected an integer step"); return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step)); } /// Return the runtime value for VF. 
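// Illustrative sketch (guarded out of the build): for fixed-width vectors the
// runtime VF is a compile-time constant, while for scalable vectors it is
// vscale times the known minimum element count and is only known at run time.
// The helper below is a hypothetical scalar model of that value.
#if 0
// Scalar model of the value produced for an element count:
//   fixed VF:    <n x ty>          -> n
//   scalable VF: <vscale x n x ty> -> vscale * n, with vscale a runtime value.
static unsigned runtimeVF(unsigned MinKnownElements, bool Scalable,
                          unsigned VScale) {
  return Scalable ? VScale * MinKnownElements : MinKnownElements;
}
#endif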
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { return B.CreateElementCount(Ty, VF); } const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, Loop *OrigLoop) { const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); assert(!isa(BackedgeTakenCount) && "Invalid loop count"); ScalarEvolution &SE = *PSE.getSE(); return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop); } static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, ElementCount VF) { assert(FTy->isFloatingPointTy() && "Expected floating point type!"); Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); return B.CreateUIToFP(RuntimeVF, FTy); } void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); ORE->emit( createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) << "loop not vectorized: " << OREMsg); } void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); ORE->emit( createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) << Msg); } } // end namespace llvm #ifndef NDEBUG /// \return string containing a file name and a line # for the given loop. static std::string getDebugLocString(const Loop *L) { std::string Result; if (L) { raw_string_ostream OS(Result); if (const DebugLoc LoopDbgLoc = L->getStartLoc()) LoopDbgLoc.print(OS); else // Just print the module name. OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); OS.flush(); } return Result; } #endif void InnerLoopVectorizer::collectPoisonGeneratingRecipes( VPTransformState &State) { // Collect recipes in the backward slice of `Root` that may generate a poison // value that is used after vectorization. SmallPtrSet Visited; auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { SmallVector Worklist; Worklist.push_back(Root); // Traverse the backward slice of Root through its use-def chain. while (!Worklist.empty()) { VPRecipeBase *CurRec = Worklist.back(); Worklist.pop_back(); if (!Visited.insert(CurRec).second) continue; // Prune search if we find another recipe generating a widen memory // instruction. Widen memory instructions involved in address computation // will lead to gather/scatter instructions, which don't need to be // handled. if (isa(CurRec) || isa(CurRec) || isa(CurRec) || isa(CurRec) || isa(CurRec)) continue; // This recipe contributes to the address computation of a widen // load/store. If the underlying instruction has poison-generating flags, // drop them directly. if (auto *RecWithFlags = dyn_cast(CurRec)) { RecWithFlags->dropPoisonGeneratingFlags(); } else { Instruction *Instr = CurRec->getUnderlyingInstr(); (void)Instr; assert((!Instr || !Instr->hasPoisonGeneratingFlags()) && "found instruction with poison generating flags not covered by " "VPRecipeWithIRFlags"); } // Add new definitions to the worklist. 
for (VPValue *operand : CurRec->operands()) if (VPRecipeBase *OpDef = operand->getDefiningRecipe()) Worklist.push_back(OpDef); } }); // Traverse all the recipes in the VPlan and collect the poison-generating // recipes in the backward slice starting at the address of a VPWidenRecipe or // VPInterleaveRecipe. auto Iter = vp_depth_first_deep(State.Plan->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { for (VPRecipeBase &Recipe : *VPBB) { if (auto *WidenRec = dyn_cast(&Recipe)) { Instruction &UnderlyingInstr = WidenRec->getIngredient(); VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe(); if (AddrDef && WidenRec->isConsecutive() && Legal->blockNeedsPredication(UnderlyingInstr.getParent())) collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); } else if (auto *InterleaveRec = dyn_cast(&Recipe)) { VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe(); if (AddrDef) { // Check if any member of the interleave group needs predication. const InterleaveGroup *InterGroup = InterleaveRec->getInterleaveGroup(); bool NeedPredication = false; for (int I = 0, NumMembers = InterGroup->getNumMembers(); I < NumMembers; ++I) { Instruction *Member = InterGroup->getMember(I); if (Member) NeedPredication |= Legal->blockNeedsPredication(Member->getParent()); } if (NeedPredication) collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); } } } } } PHINode *InnerLoopVectorizer::getReductionResumeValue( const RecurrenceDescriptor &RdxDesc) { auto It = ReductionResumeValues.find(&RdxDesc); assert(It != ReductionResumeValues.end() && "Expected to find a resume value for the reduction."); return It->second; } namespace llvm { // Loop vectorization cost-model hints how the scalar epilogue loop should be // lowered. enum ScalarEpilogueLowering { // The default: allowing scalar epilogues. CM_ScalarEpilogueAllowed, // Vectorization with OptForSize: don't allow epilogues. CM_ScalarEpilogueNotAllowedOptSize, // A special case of vectorisation with OptForSize: loops with a very small // trip count are considered for vectorization under OptForSize, thereby // making sure the cost of their loop body is dominant, free of runtime // guards and scalar iteration overheads. CM_ScalarEpilogueNotAllowedLowTripLoop, // Loop hint predicate indicating an epilogue is undesired. CM_ScalarEpilogueNotNeededUsePredicate, // Directive indicating we must either tail fold or not vectorize CM_ScalarEpilogueNotAllowedUsePredicate }; using InstructionVFPair = std::pair; /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. /// In many cases vectorization is not profitable. This can happen because of /// a number of reasons. In this class we mainly attempt to predict the /// expected speedup/slowdowns due to the supported instruction set. We use the /// TargetTransformInfo to query the different backends for the cost of /// different operations. 
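// Illustrative sketch (guarded out of the build) of the essence of that
// decision: compare each candidate VF by expected cost per scalar iteration
// (vector cost / VF) against the scalar baseline and keep the cheapest. This
// is a simplified model; the real selection also accounts for register
// pressure, runtime-check overhead, and invalid costs. Names are hypothetical.
#if 0
#include <utility>
#include <vector>

// Candidates are (VF, total cost of one vector iteration). Returns the chosen
// VF, or 1 if no candidate beats the scalar loop.
static unsigned pickBestVF(
    float ScalarIterationCost,
    const std::vector<std::pair<unsigned, float>> &Candidates) {
  unsigned BestVF = 1;
  float BestPerLane = ScalarIterationCost;
  for (const auto &[VF, VectorCost] : Candidates) {
    float PerLane = VectorCost / VF;
    if (PerLane < BestPerLane) {
      BestPerLane = PerLane;
      BestVF = VF;
    }
  }
  return BestVF;
}
#endif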
class LoopVectorizationCostModel { public: LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI) : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} /// \return An upper bound for the vectorization factors (both fixed and /// scalable). If the factors are 0, vectorization and interleaving should be /// avoided up front. FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); /// \return True if runtime checks are required for vectorization, and false /// otherwise. bool runtimeChecksRequired(); /// Setup cost-based decisions for user vectorization factor. /// \return true if the UserVF is a feasible VF to be chosen. bool selectUserVectorizationFactor(ElementCount UserVF) { collectUniformsAndScalars(UserVF); collectInstsToScalarize(UserVF); return expectedCost(UserVF).first.isValid(); } /// \return The size (in bits) of the smallest and widest types in the code /// that needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices. std::pair getSmallestAndWidestTypes(); /// \return The desired interleave count. /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. /// This function takes cost-based decisions for Load/Store instructions /// and collects them in a map. This decisions map is used for building /// the lists of loop-uniform and loop-scalar instructions. /// The calculated cost is saved with widening decision in order to /// avoid redundant calculations. void setCostBasedWideningDecision(ElementCount VF); /// A struct that represents some properties of the register usage /// of a loop. struct RegisterUsage { /// Holds the number of loop invariant values that are used in the loop. /// The key is ClassID of target-provided register class. SmallMapVector LoopInvariantRegs; /// Holds the maximum number of concurrent live intervals in the loop. /// The key is ClassID of target-provided register class. SmallMapVector MaxLocalUsers; }; /// \return Returns information about the register usages of the loop for the /// given vectorization factors. SmallVector calculateRegisterUsage(ArrayRef VFs); /// Collect values we want to ignore in the cost model. void collectValuesToIgnore(); /// Collect all element types in the loop for which widening is needed. void collectElementTypesForWidening(); /// Split reductions into those that happen in the loop, and those that happen /// outside. In loop reductions are collected into InLoopReductionChains. void collectInLoopReductions(); /// Returns true if we should use strict in-order reductions for the given /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, /// the IsOrdered flag of RdxDesc is set and we do not allow reordering /// of FP operations. 
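// Illustrative sketch (guarded out of the build) of why ordered reductions
// matter: reassociating a floating-point sum into per-lane partial sums can
// change the rounding, so a strict in-order reduction must preserve the
// original left-to-right evaluation instead of combining independent
// accumulators. The function names and VF value are hypothetical.
#if 0
#include <cstddef>

// Strict, in-order reduction: exactly the scalar loop's rounding behaviour.
static float reduceInOrder(const float *A, size_t N) {
  float Sum = 0.0f;
  for (size_t I = 0; I < N; ++I)
    Sum += A[I];
  return Sum;
}

// Reassociated reduction (what unordered vectorization effectively does):
// one partial sum per lane, combined at the end. May round differently.
static float reduceReassociated(const float *A, size_t N) {
  const size_t VF = 4;
  float Partial[VF] = {0, 0, 0, 0};
  size_t I = 0;
  for (; I + VF <= N; I += VF)
    for (size_t L = 0; L < VF; ++L)
      Partial[L] += A[I + L];
  float Sum = Partial[0] + Partial[1] + Partial[2] + Partial[3];
  for (; I < N; ++I)
    Sum += A[I];
  return Sum;
}
#endif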
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { return !Hints->allowReordering() && RdxDesc.isOrdered(); } /// \returns The smallest bitwidth each instruction can be represented with. /// The vector equivalents of these instructions should be truncated to this /// type. const MapVector &getMinimalBitwidths() const { return MinBWs; } /// \returns True if it is more profitable to scalarize instruction \p I for /// vectorization factor \p VF. bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { assert(VF.isVector() && "Profitable to scalarize relevant only for VF > 1."); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. if (EnableVPlanNativePath) return false; auto Scalars = InstsToScalarize.find(VF); assert(Scalars != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); return Scalars->second.contains(I); } /// Returns true if \p I is known to be uniform after vectorization. bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { // Pseudo probe needs to be duplicated for each unrolled iteration and // vector lane so that profiled loop trip count can be accurately // accumulated instead of being under counted. if (isa(I)) return false; if (VF.isScalar()) return true; // Cost model is not run in the VPlan-native path - return conservative // result until this changes. if (EnableVPlanNativePath) return false; auto UniformsPerVF = Uniforms.find(VF); assert(UniformsPerVF != Uniforms.end() && "VF not yet analyzed for uniformity"); return UniformsPerVF->second.count(I); } /// Returns true if \p I is known to be scalar after vectorization. bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { if (VF.isScalar()) return true; // Cost model is not run in the VPlan-native path - return conservative // result until this changes. if (EnableVPlanNativePath) return false; auto ScalarsPerVF = Scalars.find(VF); assert(ScalarsPerVF != Scalars.end() && "Scalar values are not calculated for VF"); return ScalarsPerVF->second.count(I); } /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { return VF.isVector() && MinBWs.contains(I) && !isProfitableToScalarize(I, VF) && !isScalarAfterVectorization(I, VF); } /// Decision that was taken during cost calculation for memory instruction. enum InstWidening { CM_Unknown, CM_Widen, // For consecutive accesses with stride +1. CM_Widen_Reverse, // For consecutive accesses with stride -1. CM_Interleave, CM_GatherScatter, CM_Scalarize }; /// Save vectorization decision \p W and \p Cost taken by the cost model for /// instruction \p I and vector width \p VF. void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost) { assert(VF.isVector() && "Expected VF >=2"); WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); } /// Save vectorization decision \p W and \p Cost taken by the cost model for /// interleaving group \p Grp and vector width \p VF. void setWideningDecision(const InterleaveGroup *Grp, ElementCount VF, InstWidening W, InstructionCost Cost) { assert(VF.isVector() && "Expected VF >=2"); /// Broadcast this decicion to all instructions inside the group. /// But the cost will be assigned to one instruction only. 
for (unsigned i = 0; i < Grp->getFactor(); ++i) { if (auto *I = Grp->getMember(i)) { if (Grp->getInsertPos() == I) WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); else WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); } } } /// Return the cost model decision for the given instruction \p I and vector /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { assert(VF.isVector() && "Expected VF to be a vector VF"); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. if (EnableVPlanNativePath) return CM_GatherScatter; std::pair InstOnVF = std::make_pair(I, VF); auto Itr = WideningDecisions.find(InstOnVF); if (Itr == WideningDecisions.end()) return CM_Unknown; return Itr->second.first; } /// Return the vectorization cost for the given instruction \p I and vector /// width \p VF. InstructionCost getWideningCost(Instruction *I, ElementCount VF) { assert(VF.isVector() && "Expected VF >=2"); std::pair InstOnVF = std::make_pair(I, VF); assert(WideningDecisions.contains(InstOnVF) && "The cost is not calculated"); return WideningDecisions[InstOnVF].second; } /// Return True if instruction \p I is an optimizable truncate whose operand /// is an induction variable. Such a truncate will be removed by adding a new /// induction variable with the destination type. bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { // If the instruction is not a truncate, return false. auto *Trunc = dyn_cast(I); if (!Trunc) return false; // Get the source and destination types of the truncate. Type *SrcTy = ToVectorTy(cast(I)->getSrcTy(), VF); Type *DestTy = ToVectorTy(cast(I)->getDestTy(), VF); // If the truncate is free for the given types, return false. Replacing a // free truncate with an induction variable would add an induction variable // update instruction to each iteration of the loop. We exclude from this // check the primary induction variable since it will need an update // instruction regardless. Value *Op = Trunc->getOperand(0); if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) return false; // If the truncated value is not an induction variable, return false. return Legal->isInductionPhi(Op); } /// Collects the instructions to scalarize for each predicated instruction in /// the loop. void collectInstsToScalarize(ElementCount VF); /// Collect Uniform and Scalar values for the given \p VF. /// The sets depend on CM decision for Load/Store instructions /// that may be vectorized as interleave, gather-scatter or scalarized. void collectUniformsAndScalars(ElementCount VF) { // Do the analysis once. if (VF.isScalar() || Uniforms.contains(VF)) return; setCostBasedWideningDecision(VF); collectLoopUniforms(VF); collectLoopScalars(VF); } /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { return Legal->isConsecutivePtr(DataType, Ptr) && TTI.isLegalMaskedStore(DataType, Alignment); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. 
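// Illustrative sketch (guarded out of the build) of the semantics a masked
// load must provide when an access is widened under predication: disabled
// lanes must not touch memory and yield the pass-through value. The helper
// name is hypothetical; this is a scalar model, not the intrinsic itself.
#if 0
#include <cstddef>
#include <vector>

// Scalar model of a masked vector load: lane L reads Ptr[L] only if Mask[L]
// is set; otherwise it produces PassThru[L] and performs no memory access.
static std::vector<int> maskedLoad(const int *Ptr,
                                   const std::vector<bool> &Mask,
                                   const std::vector<int> &PassThru) {
  std::vector<int> Result(PassThru);
  for (size_t L = 0; L < Mask.size(); ++L)
    if (Mask[L])
      Result[L] = Ptr[L];
  return Result;
}
#endif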
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { return Legal->isConsecutivePtr(DataType, Ptr) && TTI.isLegalMaskedLoad(DataType, Alignment); } /// Returns true if the target machine can represent \p V as a masked gather /// or scatter operation. bool isLegalGatherOrScatter(Value *V, ElementCount VF) { bool LI = isa(V); bool SI = isa(V); if (!LI && !SI) return false; auto *Ty = getLoadStoreType(V); Align Align = getLoadStoreAlignment(V); if (VF.isVector()) Ty = VectorType::get(Ty, VF); return (LI && TTI.isLegalMaskedGather(Ty, Align)) || (SI && TTI.isLegalMaskedScatter(Ty, Align)); } /// Returns true if the target machine supports all of the reduction /// variables found for the given VF. bool canVectorizeReductions(ElementCount VF) const { return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { const RecurrenceDescriptor &RdxDesc = Reduction.second; return TTI.isLegalToVectorizeReduction(RdxDesc, VF); })); } /// Given costs for both strategies, return true if the scalar predication /// lowering should be used for div/rem. This incorporates an override /// option so it is not simply a cost comparison. bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const { switch (ForceSafeDivisor) { case cl::BOU_UNSET: return ScalarCost < SafeDivisorCost; case cl::BOU_TRUE: return false; case cl::BOU_FALSE: return true; }; llvm_unreachable("impossible case value"); } /// Returns true if \p I is an instruction which requires predication and /// for which our chosen predication strategy is scalarization (i.e. we /// don't have an alternate strategy such as masking available). /// \p VF is the vectorization factor that will be used to vectorize \p I. bool isScalarWithPredication(Instruction *I, ElementCount VF) const; /// Returns true if \p I is an instruction that needs to be predicated /// at runtime. The result is independent of the predication mechanism. /// Superset of instructions that return true for isScalarWithPredication. bool isPredicatedInst(Instruction *I) const; /// Return the costs for our two available strategies for lowering a /// div/rem operation which requires speculating at least one lane. /// First result is for scalarization (will be invalid for scalable /// vectors); second is for the safe-divisor strategy. std::pair getDivRemSpeculationCost(Instruction *I, ElementCount VF) const; /// Returns true if \p I is a memory instruction with consecutive memory /// access that can be widened. bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); /// Returns true if \p I is a memory instruction in an interleaved-group /// of memory accesses that can be vectorized with wide vector loads/stores /// and shuffles. bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF); /// Check if \p Instr belongs to any interleaved access group. bool isAccessInterleaved(Instruction *Instr) { return InterleaveInfo.isInterleaved(Instr); } /// Get the interleaved access group that \p Instr belongs to. const InterleaveGroup * getInterleavedAccessGroup(Instruction *Instr) { return InterleaveInfo.getInterleaveGroup(Instr); } /// Returns true if we're required to use a scalar epilogue for at least /// the final iteration of the original loop. bool requiresScalarEpilogue(bool IsVectorizing) const { if (!isScalarEpilogueAllowed()) return false; // If we might exit from anywhere but the latch, must run the exiting // iteration in scalar form. 
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) return true; return IsVectorizing && InterleaveInfo.requiresScalarEpilogue(); } /// Returns true if we're required to use a scalar epilogue for at least /// the final iteration of the original loop for all VFs in \p Range. /// A scalar epilogue must either be required for all VFs in \p Range or for /// none. bool requiresScalarEpilogue(VFRange Range) const { auto RequiresScalarEpilogue = [this](ElementCount VF) { return requiresScalarEpilogue(VF.isVector()); }; bool IsRequired = all_of(Range, RequiresScalarEpilogue); assert( (IsRequired || none_of(Range, RequiresScalarEpilogue)) && "all VFs in range must agree on whether a scalar epilogue is required"); return IsRequired; } /// Returns true if a scalar epilogue is not allowed due to optsize or a /// loop hint annotation. bool isScalarEpilogueAllowed() const { return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; } /// Returns the TailFoldingStyle that is best for the current loop. TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { if (!CanFoldTailByMasking) return TailFoldingStyle::None; if (ForceTailFoldingStyle.getNumOccurrences()) return ForceTailFoldingStyle; return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow); } /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return getTailFoldingStyle() != TailFoldingStyle::None; } /// Returns true if the instructions in this block requires predication /// for any reason, e.g. because tail folding now requires a predicate /// or because the block in the original loop was predicated. bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { return foldTailByMasking() || Legal->blockNeedsPredication(BB); } /// A SmallMapVector to store the InLoop reduction op chains, mapping phi /// nodes to the chain of instructions representing the reductions. Uses a /// MapVector to ensure deterministic iteration order. using ReductionChainMap = SmallMapVector, 4>; /// Return the chain of instructions representing an inloop reduction. const ReductionChainMap &getInLoopReductionChains() const { return InLoopReductionChains; } /// Returns true if the Phi is part of an inloop reduction. bool isInLoopReduction(PHINode *Phi) const { return InLoopReductionChains.count(Phi); } /// Estimate cost of an intrinsic call instruction CI if it were vectorized /// with factor VF. Return the cost of the instruction, including /// scalarization overhead if it's needed. InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; /// Estimate cost of a call instruction CI if it were vectorized with factor /// VF. Return the cost of the instruction, including scalarization overhead /// if it's needed. The flag NeedToScalarize shows if the call needs to be /// scalarized - /// i.e. either vector version isn't available, or is too expensive. InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask = nullptr) const; /// Invalidates decisions already taken by the cost model. void invalidateCostModelingDecisions() { WideningDecisions.clear(); Uniforms.clear(); Scalars.clear(); } /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually /// operate on vector values after type legalization in the backend. If this /// latter value is false, then all operations will be scalarized (i.e. no /// vectorization has actually taken place). 
using VectorizationCostTy = std::pair; /// Returns the expected execution cost. The unit of the cost does /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by /// the factor width. If \p Invalid is not nullptr, this function /// will add a pair(Instruction*, ElementCount) to \p Invalid for /// each instruction that has an Invalid cost for the given VF. VectorizationCostTy expectedCost(ElementCount VF, SmallVectorImpl *Invalid = nullptr); bool hasPredStores() const { return NumPredStores > 0; } /// Returns true if epilogue vectorization is considered profitable, and /// false otherwise. /// \p VF is the vectorization factor chosen for the original loop. bool isEpilogueVectorizationProfitable(const ElementCount VF) const; private: unsigned NumPredStores = 0; /// \return An upper bound for the vectorization factors for both /// fixed and scalable vectorization, where the minimum-known number of /// elements is a power-of-2 larger than zero. If scalable vectorization is /// disabled or unsupported, then the scalable part will be equal to /// ElementCount::getScalable(0). FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking); /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. /// This is a helper function of computeFeasibleMaxVF. ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, ElementCount MaxSafeVF, bool FoldTailByMasking); /// \return the maximum legal scalable VF, based on the safe max number /// of elements. ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. InstructionCost getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy); /// Return the cost of instructions in an inloop reduction pattern, if I is /// part of that pattern. std::optional getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, TTI::TargetCostKind CostKind); /// Calculate vectorization cost of memory instruction \p I. InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); /// The cost computation for scalarized memory instruction. InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); /// The cost computation for interleaving group of memory instructions. InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); /// The cost computation for Gather/Scatter instruction. InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); /// The cost computation for widening instruction \p I with consecutive /// memory access. InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); /// The cost calculation for Load/Store instruction \p I with uniform pointer - /// Load: scalar load + broadcast. /// Store: scalar store + (loop invariant value stored? 0 : extract of last /// element) InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. 
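// Illustrative sketch (guarded out of the build) of where that overhead comes
// from: when an operation has no vector form, each lane is extracted, the
// scalar operation is applied, and the result is inserted back, so the
// overhead grows with VF on top of the VF scalar operations themselves. The
// helper name is hypothetical.
#if 0
#include <cstddef>
#include <vector>

// Scalar model of scalarizing a unary operation over a vector value:
// VF extracts + VF scalar ops + VF inserts.
static std::vector<int> scalarizeUnaryOp(const std::vector<int> &Operand,
                                         int (*ScalarOp)(int)) {
  std::vector<int> Result(Operand.size());
  for (size_t Lane = 0; Lane < Operand.size(); ++Lane) {
    int Elt = Operand[Lane]; // extractelement
    int Res = ScalarOp(Elt); // scalar operation
    Result[Lane] = Res;      // insertelement
  }
  return Result;
}
#endif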
InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const; /// Returns true if an artificially high cost for emulated masked memrefs /// should be used. bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated /// to this type. MapVector MinBWs; /// A type representing the costs for instructions if they were to be /// scalarized rather than vectorized. The entries are Instruction-Cost /// pairs. using ScalarCostsTy = DenseMap; /// A set containing all BasicBlocks that are known to present after /// vectorization as a predicated block. DenseMap> PredicatedBBsAfterVectorization; /// Records whether it is allowed to have the original scalar loop execute at /// least once. This may be needed as a fallback loop in case runtime /// aliasing/dependence checks fail, or to handle the tail/remainder /// iterations when the trip count is unknown or doesn't divide by the VF, /// or as a peel-loop to handle gaps in interleave-groups. /// Under optsize and when the trip count is very small we don't allow any /// iterations to execute in the scalar loop. ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. bool CanFoldTailByMasking = false; /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated /// vectorization factor. The entries are VF-ScalarCostTy pairs. DenseMap InstsToScalarize; /// Holds the instructions known to be uniform after vectorization. /// The data is collected per VF. DenseMap> Uniforms; /// Holds the instructions known to be scalar after vectorization. /// The data is collected per VF. DenseMap> Scalars; /// Holds the instructions (address computations) that are forced to be /// scalarized. DenseMap> ForcedScalars; /// PHINodes of the reductions that should be expanded in-loop along with /// their associated chains of reduction operations, in program order from top /// (PHI) to bottom ReductionChainMap InLoopReductionChains; /// A Map of inloop reduction operations and their immediate chain operand. /// FIXME: This can be removed once reductions can be costed correctly in /// vplan. This was added to allow quick lookup to the inloop operations, /// without having to loop through InLoopReductionChains. DenseMap InLoopReductionImmediateChains; /// Returns the expected difference in cost from scalarizing the expression /// feeding a predicated instruction \p PredInst. The instructions to /// scalarize and their scalar costs are collected in \p ScalarCosts. A /// non-negative return value implies the expression will be scalarized. /// Currently, only single-use chains are considered for scalarization. InstructionCost computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF); /// Collect the instructions that are uniform after vectorization. An /// instruction is uniform if we represent it with a single scalar value in /// the vectorized loop corresponding to each vector iteration. Examples of /// uniform instructions include pointer operands of consecutive or /// interleaved memory accesses. Note that although uniformity implies an /// instruction will be scalar, the reverse is not true. 
In general, a /// scalarized instruction will be represented by VF scalar values in the /// vectorized loop, each corresponding to an iteration of the original /// scalar loop. void collectLoopUniforms(ElementCount VF); /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized /// during vectorization. collectLoopScalars should only add non-uniform nodes /// to the list if they are used by a load/store instruction that is marked as /// CM_Scalarize. Non-uniform scalarized instructions will be represented by /// VF values in the vectorized loop, each corresponding to an iteration of /// the original scalar loop. void collectLoopScalars(ElementCount VF); /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. using DecisionList = DenseMap, std::pair>; DecisionList WideningDecisions; /// Returns true if \p V is expected to be vectorized and it needs to be /// extracted. bool needsExtract(Value *V, ElementCount VF) const { Instruction *I = dyn_cast(V); if (VF.isScalar() || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) return false; // Assume we can vectorize V (and hence we need extraction) if the // scalars are not computed yet. This can happen, because it is called // via getScalarizationOverhead from setCostBasedWideningDecision, before // the scalars are collected. That should be a safe assumption in most // cases, because we check if the operands have vectorizable types // beforehand in LoopVectorizationLegality. return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF); }; /// Returns a range containing only operands needing to be extracted. SmallVector filterExtractingOperands(Instruction::op_range Ops, ElementCount VF) const { return SmallVector(make_filter_range( Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } public: /// The loop that we evaluate. Loop *TheLoop; /// Predicated scalar evolution analysis. PredicatedScalarEvolution &PSE; /// Loop Info analysis. LoopInfo *LI; /// Vectorization legality. LoopVectorizationLegality *Legal; /// Vector target information. const TargetTransformInfo &TTI; /// Target Library Info. const TargetLibraryInfo *TLI; /// Demanded bits analysis. DemandedBits *DB; /// Assumption cache. AssumptionCache *AC; /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; const Function *TheFunction; /// Loop Vectorize Hint. const LoopVectorizeHints *Hints; /// The interleave access information contains groups of interleaved accesses /// with the same stride and close to each other. InterleavedAccessInfo &InterleaveInfo; /// Values to ignore in the cost model. SmallPtrSet ValuesToIgnore; /// Values to ignore in the cost model when VF > 1. SmallPtrSet VecValuesToIgnore; /// All element types found in the loop. SmallPtrSet ElementTypesInLoop; }; } // end namespace llvm namespace { /// Helper struct to manage generating runtime checks for vectorization. /// /// The runtime checks are created up-front in temporary blocks to allow better /// estimating the cost and un-linked from the existing IR. After deciding to /// vectorize, the checks are moved back. If deciding not to vectorize, the /// temporary blocks are completely removed. class GeneratedRTChecks { /// Basic block which contains the generated SCEV checks, if any. BasicBlock *SCEVCheckBlock = nullptr; /// The value representing the result of the generated SCEV checks. 
If it is /// nullptr, either no SCEV checks have been generated or they have been used. Value *SCEVCheckCond = nullptr; /// Basic block which contains the generated memory runtime checks, if any. BasicBlock *MemCheckBlock = nullptr; /// The value representing the result of the generated memory runtime checks. /// If it is nullptr, either no memory runtime checks have been generated or /// they have been used. Value *MemRuntimeCheckCond = nullptr; DominatorTree *DT; LoopInfo *LI; TargetTransformInfo *TTI; SCEVExpander SCEVExp; SCEVExpander MemCheckExp; bool CostTooHigh = false; public: GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, TargetTransformInfo *TTI, const DataLayout &DL) : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), MemCheckExp(SE, DL, "scev.check") {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can /// accurately estimate the cost of the runtime checks. The blocks are /// un-linked from the IR and is added back during vector code generation. If /// there is no vector code generation, the check blocks are removed /// completely. void Create(Loop *L, const LoopAccessInfo &LAI, const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { // Hard cutoff to limit compile-time increase in case a very large number of // runtime checks needs to be generated. // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to // profile info. CostTooHigh = LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; if (CostTooHigh) return; BasicBlock *LoopHeader = L->getHeader(); BasicBlock *Preheader = L->getLoopPreheader(); // Use SplitBlock to create blocks for SCEV & memory runtime checks to // ensure the blocks are properly added to LoopInfo & DominatorTree. Those // may be used by SCEVExpander. The blocks will be un-linked from their // predecessors and removed from LI & DT at the end of the function. if (!UnionPred.isAlwaysTrue()) { SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, nullptr, "vector.scevcheck"); SCEVCheckCond = SCEVExp.expandCodeForPredicate( &UnionPred, SCEVCheckBlock->getTerminator()); } const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); if (RtPtrChecking.Need) { auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, "vector.memcheck"); auto DiffChecks = RtPtrChecking.getDiffChecks(); if (DiffChecks) { Value *RuntimeVF = nullptr; MemRuntimeCheckCond = addDiffRuntimeChecks( MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { if (!RuntimeVF) RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF); return RuntimeVF; }, IC); } else { MemRuntimeCheckCond = addRuntimeChecks(MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(), MemCheckExp); } assert(MemRuntimeCheckCond && "no RT checks generated although RtPtrChecking " "claimed checks are required"); } if (!MemCheckBlock && !SCEVCheckBlock) return; // Unhook the temporary block with the checks, update various places // accordingly. 
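// Illustrative aside (not part of this file): what the memory runtime checks
// generated above conceptually prove -- that the byte ranges touched through
// two pointer groups are disjoint. A minimal sketch on plain integers; the
// real checks are emitted over SCEV-expanded bounds by addRuntimeChecks /
// addDiffRuntimeChecks. The struct and helper are hypothetical.
#include <cstdint>

struct PtrRangeSketch {
  uint64_t Start; // first byte accessed
  uint64_t End;   // one past the last byte accessed
};

// Safe to vectorize (with respect to this pair) iff one range ends before the
// other begins; the emitted check branches to the scalar loop otherwise.
inline bool rangesDisjointSketch(const PtrRangeSketch &A,
                                 const PtrRangeSketch &B) {
  return A.End <= B.Start || B.End <= A.Start;
}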
if (SCEVCheckBlock) SCEVCheckBlock->replaceAllUsesWith(Preheader); if (MemCheckBlock) MemCheckBlock->replaceAllUsesWith(Preheader); if (SCEVCheckBlock) { SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); Preheader->getTerminator()->eraseFromParent(); } if (MemCheckBlock) { MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); new UnreachableInst(Preheader->getContext(), MemCheckBlock); Preheader->getTerminator()->eraseFromParent(); } DT->changeImmediateDominator(LoopHeader, Preheader); if (MemCheckBlock) { DT->eraseNode(MemCheckBlock); LI->removeBlock(MemCheckBlock); } if (SCEVCheckBlock) { DT->eraseNode(SCEVCheckBlock); LI->removeBlock(SCEVCheckBlock); } } InstructionCost getCost() { if (SCEVCheckBlock || MemCheckBlock) LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); if (CostTooHigh) { InstructionCost Cost; Cost.setInvalid(); LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); return Cost; } InstructionCost RTCheckCost = 0; if (SCEVCheckBlock) for (Instruction &I : *SCEVCheckBlock) { if (SCEVCheckBlock->getTerminator() == &I) continue; InstructionCost C = TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); RTCheckCost += C; } if (MemCheckBlock) for (Instruction &I : *MemCheckBlock) { if (MemCheckBlock->getTerminator() == &I) continue; InstructionCost C = TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); RTCheckCost += C; } if (SCEVCheckBlock || MemCheckBlock) LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost << "\n"); return RTCheckCost; } /// Remove the created SCEV & memory runtime check blocks & instructions, if /// unused. ~GeneratedRTChecks() { SCEVExpanderCleaner SCEVCleaner(SCEVExp); SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); if (!SCEVCheckCond) SCEVCleaner.markResultUsed(); if (!MemRuntimeCheckCond) MemCheckCleaner.markResultUsed(); if (MemRuntimeCheckCond) { auto &SE = *MemCheckExp.getSE(); // Memory runtime check generation creates compares that use expanded // values. Remove them before running the SCEVExpanderCleaners. for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { if (MemCheckExp.isInsertedInstruction(&I)) continue; SE.forgetValue(&I); I.eraseFromParent(); } } MemCheckCleaner.cleanup(); SCEVCleaner.cleanup(); if (SCEVCheckCond) SCEVCheckBlock->eraseFromParent(); if (MemRuntimeCheckCond) MemCheckBlock->eraseFromParent(); } /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and /// adjusts the branches to branch to the vector preheader or \p Bypass, /// depending on the generated condition. BasicBlock *emitSCEVChecks(BasicBlock *Bypass, BasicBlock *LoopVectorPreHeader, BasicBlock *LoopExitBlock) { if (!SCEVCheckCond) return nullptr; Value *Cond = SCEVCheckCond; // Mark the check as used, to prevent it from being removed during cleanup. SCEVCheckCond = nullptr; if (auto *C = dyn_cast(Cond)) if (C->isZero()) return nullptr; auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); // Create new preheader for vector loop. 
if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); SCEVCheckBlock->getTerminator()->eraseFromParent(); SCEVCheckBlock->moveBefore(LoopVectorPreHeader); Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, SCEVCheckBlock); DT->addNewBlock(SCEVCheckBlock, Pred); DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); return SCEVCheckBlock; } /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts /// the branches to branch to the vector preheader or \p Bypass, depending on /// the generated condition. BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, BasicBlock *LoopVectorPreHeader) { // Check if we generated code that checks in runtime if arrays overlap. if (!MemRuntimeCheckCond) return nullptr; auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, MemCheckBlock); DT->addNewBlock(MemCheckBlock, Pred); DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); MemCheckBlock->moveBefore(LoopVectorPreHeader); if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) PL->addBasicBlockToLoop(MemCheckBlock, *LI); ReplaceInstWithInst( MemCheckBlock->getTerminator(), BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); MemCheckBlock->getTerminator()->setDebugLoc( Pred->getTerminator()->getDebugLoc()); // Mark the check as used, to prevent it from being removed during cleanup. MemRuntimeCheckCond = nullptr; return MemCheckBlock; } }; } // namespace static bool useActiveLaneMask(TailFoldingStyle Style) { return Style == TailFoldingStyle::Data || Style == TailFoldingStyle::DataAndControlFlow || Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; } static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { return Style == TailFoldingStyle::DataAndControlFlow || Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; } // Return true if \p OuterLp is an outer loop annotated with hints for explicit // vectorization. The loop needs to be annotated with #pragma omp simd // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the // vector length information is not provided, vectorization is not considered // explicit. Interleave hints are not allowed either. These limitations will be // relaxed in the future. // Please, note that we are currently forced to abuse the pragma 'clang // vectorize' semantics. This pragma provides *auto-vectorization hints* // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' // provides *explicit vectorization hints* (LV can bypass legal checks and // assume that vectorization is legal). However, both hints are implemented // using the same metadata (llvm.loop.vectorize, processed by // LoopVectorizeHints). This will be fixed in the future when the native IR // representation for pragma 'omp simd' is introduced. static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE) { assert(!OuterLp->isInnermost() && "This is not an outer loop"); LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); // Only outer loops with an explicit vectorization hint are supported. // Unannotated outer loops are ignored. 
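// Illustrative aside (not part of this file): what the useActiveLaneMask
// helpers above imply for tail folding -- each vector iteration runs under a
// mask where lane I is active iff (Base + I) < TripCount. A scalar model of
// that mask; the real IR uses the llvm.get.active.lane.mask intrinsic, and the
// helper name here is hypothetical.
#include <cstdint>
#include <vector>

inline std::vector<bool> activeLaneMaskSketch(uint64_t Base, uint64_t TripCount,
                                              unsigned VF) {
  std::vector<bool> Mask(VF);
  for (unsigned I = 0; I < VF; ++I)
    Mask[I] = (Base + I) < TripCount; // trailing lanes of the final iteration are off
  return Mask;
}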
if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) return false; Function *Fn = OuterLp->getHeader()->getParent(); if (!Hints.allowVectorization(Fn, OuterLp, true /*VectorizeOnlyWhenForced*/)) { LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); return false; } if (Hints.getInterleave() > 1) { // TODO: Interleave support is future work. LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " "outer loops.\n"); Hints.emitRemarkWithHints(); return false; } return true; } static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl &V) { // Collect inner loops and outer loops without irreducible control flow. For // now, only collect outer loops that have explicit vectorization hints. If we // are stress testing the VPlan H-CFG construction, we collect the outermost // loop of every loop nest. if (L.isInnermost() || VPlanBuildStressTest || (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { LoopBlocksRPO RPOT(&L); RPOT.perform(LI); if (!containsIrreducibleCFG(RPOT, *LI)) { V.push_back(&L); // TODO: Collect inner loops inside marked outer loops in case // vectorization fails for the outer loop. Do not invoke // 'containsIrreducibleCFG' again for inner loops when the outer loop is // already known to be reducible. We can use an inherited attribute for // that. return; } } for (Loop *InnerL : L) collectSupportedLoops(*InnerL, LI, ORE, V); } //===----------------------------------------------------------------------===// // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and // LoopVectorizationCostModel and LoopVectorizationPlanner. //===----------------------------------------------------------------------===// /// This function adds /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) /// to each vector element of Val. The sequence starts at StartIndex. /// \p Opcode is relevant for FP induction variable. static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, Instruction::BinaryOps BinOp, ElementCount VF, IRBuilderBase &Builder) { assert(VF.isVector() && "only vector VFs are supported"); // Create and check the types. auto *ValVTy = cast(Val->getType()); ElementCount VLen = ValVTy->getElementCount(); Type *STy = Val->getType()->getScalarType(); assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && "Induction Step must be an integer or FP"); assert(Step->getType() == STy && "Step has wrong type"); SmallVector Indices; // Create a vector of consecutive numbers from zero to VF. VectorType *InitVecValVTy = ValVTy; if (STy->isFloatingPointTy()) { Type *InitVecValSTy = IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); InitVecValVTy = VectorType::get(InitVecValSTy, VLen); } Value *InitVec = Builder.CreateStepVector(InitVecValVTy); // Splat the StartIdx Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); if (STy->isIntegerTy()) { InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); Step = Builder.CreateVectorSplat(VLen, Step); assert(Step->getType() == Val->getType() && "Invalid step vec"); // FIXME: The newly created binary instructions should contain nsw/nuw // flags, which can be found from the original scalar operations. Step = Builder.CreateMul(InitVec, Step); return Builder.CreateAdd(Val, Step, "induction"); } // Floating point induction. 
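// Illustrative aside (not part of this file): the integer path of getStepVector
// above computes, per lane I, Val[I] + (StartIdx + I) * Step. A scalar model of
// that lane-wise formula; the helper is hypothetical and ignores the nsw/nuw
// question noted in the FIXME.
#include <cstddef>
#include <cstdint>
#include <vector>

inline std::vector<int64_t> stepVectorSketch(const std::vector<int64_t> &Val,
                                             int64_t StartIdx, int64_t Step) {
  std::vector<int64_t> Out(Val.size());
  for (std::size_t I = 0; I < Val.size(); ++I)
    Out[I] = Val[I] + (StartIdx + static_cast<int64_t>(I)) * Step;
  return Out;
}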
assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && "Binary Opcode should be specified for FP induction"); InitVec = Builder.CreateUIToFP(InitVec, ValVTy); InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); Step = Builder.CreateVectorSplat(VLen, Step); Value *MulOp = Builder.CreateFMul(InitVec, Step); return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); } /// Compute scalar induction steps. \p ScalarIV is the scalar induction /// variable on which to base the steps, \p Step is the size of the step. static void buildScalarSteps(Value *ScalarIV, Value *Step, const InductionDescriptor &ID, VPValue *Def, VPTransformState &State) { IRBuilderBase &Builder = State.Builder; // Ensure step has the same type as that of scalar IV. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); if (ScalarIVTy != Step->getType()) { // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to // avoid separate truncate here. assert(Step->getType()->isIntegerTy() && "Truncation requires an integer step"); Step = State.Builder.CreateTrunc(Step, ScalarIVTy); } // We build scalar steps for both integer and floating-point induction // variables. Here, we determine the kind of arithmetic we will perform. Instruction::BinaryOps AddOp; Instruction::BinaryOps MulOp; if (ScalarIVTy->isIntegerTy()) { AddOp = Instruction::Add; MulOp = Instruction::Mul; } else { AddOp = ID.getInductionOpcode(); MulOp = Instruction::FMul; } // Determine the number of scalars we need to generate for each unroll // iteration. bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); // Compute the scalar steps and save the results in State. Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), ScalarIVTy->getScalarSizeInBits()); Type *VecIVTy = nullptr; Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; if (!FirstLaneOnly && State.VF.isScalable()) { VecIVTy = VectorType::get(ScalarIVTy, State.VF); UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); SplatStep = Builder.CreateVectorSplat(State.VF, Step); SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); } unsigned StartPart = 0; unsigned EndPart = State.UF; unsigned StartLane = 0; unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); if (State.Instance) { StartPart = State.Instance->Part; EndPart = StartPart + 1; StartLane = State.Instance->Lane.getKnownLane(); EndLane = StartLane + 1; } for (unsigned Part = StartPart; Part < EndPart; ++Part) { Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); if (!FirstLaneOnly && State.VF.isScalable()) { auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); if (ScalarIVTy->isFloatingPointTy()) InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); State.set(Def, Add, Part); // It's useful to record the lane values too for the known minimum number // of elements so we do those below. This improves the code quality when // trying to extract the first element, for example. } if (ScalarIVTy->isFloatingPointTy()) StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { Value *StartIdx = Builder.CreateBinOp( AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); // The step returned by `createStepForVF` is a runtime-evaluated value // when VF is scalable. 
Otherwise, it should be folded into a Constant. assert((State.VF.isScalable() || isa(StartIdx)) && "Expected StartIdx to be folded to a constant when VF is not " "scalable"); auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); State.set(Def, Add, VPIteration(Part, Lane)); } } } /// Compute the transformed value of Index at offset StartValue using step /// StepValue. /// For integer induction, returns StartValue + Index * StepValue. /// For pointer induction, returns StartValue[Index * StepValue]. /// FIXME: The newly created binary instructions should contain nsw/nuw /// flags, which can be found from the original scalar operations. static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, const InductionDescriptor &ID) { Type *StepTy = Step->getType(); Value *CastedIndex = StepTy->isIntegerTy() ? B.CreateSExtOrTrunc(Index, StepTy) : B.CreateCast(Instruction::SIToFP, Index, StepTy); if (CastedIndex != Index) { CastedIndex->setName(CastedIndex->getName() + ".cast"); Index = CastedIndex; } // Note: the IR at this point is broken. We cannot use SE to create any new // SCEV and then expand it, hoping that SCEV's simplification will give us // a more optimal code. Unfortunately, attempt of doing so on invalid IR may // lead to various SCEV crashes. So all we can do is to use builder and rely // on InstCombine for future simplifications. Here we handle some trivial // cases only. auto CreateAdd = [&B](Value *X, Value *Y) { assert(X->getType() == Y->getType() && "Types don't match!"); if (auto *CX = dyn_cast(X)) if (CX->isZero()) return Y; if (auto *CY = dyn_cast(Y)) if (CY->isZero()) return X; return B.CreateAdd(X, Y); }; // We allow X to be a vector type, in which case Y will potentially be // splatted into a vector with the same element count. 
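// Illustrative aside (not part of this file): what emitTransformedIndex
// produces, modeled on plain values. For an integer induction the transformed
// value is Start + Index * Step; for a pointer induction the real code scales
// by bytes via an i8 GEP, which the element-wise sketch below only
// approximates. Both helpers are hypothetical.
#include <cstdint>

inline int64_t transformedIntIndexSketch(int64_t Start, int64_t Index,
                                         int64_t Step) {
  return Start + Index * Step;
}

template <typename T>
inline T *transformedPtrIndexSketch(T *Start, int64_t Index, int64_t Step) {
  return Start + Index * Step; // Step counted in elements here, in bytes in the real code
}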
auto CreateMul = [&B](Value *X, Value *Y) { assert(X->getType()->getScalarType() == Y->getType() && "Types don't match!"); if (auto *CX = dyn_cast(X)) if (CX->isOne()) return Y; if (auto *CY = dyn_cast(Y)) if (CY->isOne()) return X; VectorType *XVTy = dyn_cast(X->getType()); if (XVTy && !isa(Y->getType())) Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); return B.CreateMul(X, Y); }; switch (ID.getKind()) { case InductionDescriptor::IK_IntInduction: { assert(!isa(Index->getType()) && "Vector indices not supported for integer inductions yet"); assert(Index->getType() == StartValue->getType() && "Index type does not match StartValue type"); if (isa(Step) && cast(Step)->isMinusOne()) return B.CreateSub(StartValue, Index); auto *Offset = CreateMul(Index, Step); return CreateAdd(StartValue, Offset); } case InductionDescriptor::IK_PtrInduction: { return B.CreateGEP(B.getInt8Ty(), StartValue, CreateMul(Index, Step)); } case InductionDescriptor::IK_FpInduction: { assert(!isa(Index->getType()) && "Vector indices not supported for FP inductions yet"); assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); auto InductionBinOp = ID.getInductionBinOp(); assert(InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && "Original bin op should be defined for FP induction"); Value *MulExp = B.CreateFMul(Step, Index); return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, "induction"); } case InductionDescriptor::IK_NoInduction: return nullptr; } llvm_unreachable("invalid enum"); } std::optional getMaxVScale(const Function &F, const TargetTransformInfo &TTI) { if (std::optional MaxVScale = TTI.getMaxVScale()) return MaxVScale; if (F.hasFnAttribute(Attribute::VScaleRange)) return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); return std::nullopt; } /// For the given VF and UF and maximum trip count computed for the loop, return /// whether the induction variable might overflow in the vectorized loop. If not, /// then we know a runtime overflow check always evaluates to false and can be /// removed. static bool isIndvarOverflowCheckKnownFalse( const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional UF = std::nullopt) { // Always be conservative if we don't know the exact unroll factor. unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF); Type *IdxTy = Cost->Legal->getWidestInductionType(); APInt MaxUIntTripCount = cast(IdxTy)->getMask(); // We know the runtime overflow check is known false iff the (max) trip-count // is known and (max) trip-count + (VF * UF) does not overflow in the type of // the vector loop induction variable. 
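// Illustrative aside (not part of this file): the condition checked below, on
// 64-bit unsigned arithmetic. If MaxTC + VF * UF cannot wrap the induction
// type, the runtime overflow check is redundant. Hypothetical helper; the real
// code uses an APInt of the widest induction type's width, and VF here is
// assumed to already include the maximum vscale for scalable factors.
#include <cstdint>

inline bool overflowCheckKnownFalseSketch(uint64_t MaxTC, uint64_t VF,
                                          uint64_t UF, unsigned CounterBits) {
  // Largest value representable in the induction variable's type.
  const uint64_t MaxUInt =
      CounterBits >= 64 ? ~uint64_t(0) : (uint64_t(1) << CounterBits) - 1;
  // A known (nonzero) maximum trip count is required, mirroring the guard below.
  return MaxTC != 0 && MaxTC <= MaxUInt && (MaxUInt - MaxTC) > VF * UF;
}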
if (unsigned TC = Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) { uint64_t MaxVF = VF.getKnownMinValue(); if (VF.isScalable()) { std::optional MaxVScale = getMaxVScale(*Cost->TheFunction, Cost->TTI); if (!MaxVScale) return false; MaxVF *= *MaxVScale; } return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); } return false; } void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, VPTransformState &State) { Value *ScalarInst = State.get(Def, Instance); Value *VectorValue = State.get(Def, Instance.Part); VectorValue = Builder.CreateInsertElement( VectorValue, ScalarInst, Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); State.set(Def, VectorValue, Instance.Part); } // Return whether we allow using masked interleave-groups (for dealing with // strided loads/stores that reside in predicated blocks, or for dealing // with gaps). static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // If an override option has been passed in for interleaved accesses, use it. if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) return EnableMaskedInterleavedMemAccesses; return TTI.enableMaskedInterleavedAccessVectorization(); } // Try to vectorize the interleave group that \p Instr belongs to. // // E.g. Translate following interleaved load group (factor = 3): // for (i = 0; i < N; i+=3) { // R = Pic[i]; // Member of index 0 // G = Pic[i+1]; // Member of index 1 // B = Pic[i+2]; // Member of index 2 // ... // do something to R, G, B // } // To: // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements // // Or translate following interleaved store group (factor = 3): // for (i = 0; i < N; i+=3) { // ... do something to R, G, B // Pic[i] = R; // Member of index 0 // Pic[i+1] = G; // Member of index 1 // Pic[i+2] = B; // Member of index 2 // } // To: // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void InnerLoopVectorizer::vectorizeInterleaveGroup( const InterleaveGroup *Group, ArrayRef VPDefs, VPTransformState &State, VPValue *Addr, ArrayRef StoredValues, VPValue *BlockInMask, bool NeedsMaskForGaps) { Instruction *Instr = Group->getInsertPos(); const DataLayout &DL = Instr->getModule()->getDataLayout(); // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getLoadStoreType(Instr); unsigned InterleaveFactor = Group->getFactor(); auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); // Prepare for the new pointers. SmallVector AddrParts; unsigned Index = Group->getIndex(Instr); // TODO: extend the masked interleaved-group support to reversed access. assert((!BlockInMask || !Group->isReverse()) && "Reversed masked interleave-group not supported."); Value *Idx; // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, // rather than directly getting the pointer for lane VF - 1, because the // pointer operand of the interleaved access is supposed to be uniform. 
For // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. if (Group->isReverse()) { Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1)); Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor())); Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index)); Idx = Builder.CreateNeg(Idx); } else Idx = Builder.getInt32(-Index); for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); State.setDebugLocFromInst(AddrPart); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. // // E.g. a = A[i+1]; // Member of index 1 (Current instruction) // b = A[i]; // Member of index 0 // Current pointer is pointed to A[i+1], adjust it to A[i]. // // E.g. A[i+1] = a; // Member of index 1 // A[i] = b; // Member of index 0 // A[i+2] = c; // Member of index 2 (Current instruction) // Current pointer is pointed to A[i+2], adjust it to A[i]. bool InBounds = false; if (auto *gep = dyn_cast(AddrPart->stripPointerCasts())) InBounds = gep->isInBounds(); AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); // Cast to the vector pointer type. unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); Type *PtrTy = VecTy->getPointerTo(AddressSpace); AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); } State.setDebugLocFromInst(Instr); Value *PoisonVec = PoisonValue::get(VecTy); auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor]( unsigned Part, Value *MaskForGaps) -> Value * { if (VF.isScalable()) { assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); assert(InterleaveFactor == 2 && "Unsupported deinterleave factor for scalable vectors"); auto *BlockInMaskPart = State.get(BlockInMask, Part); SmallVector Ops = {BlockInMaskPart, BlockInMaskPart}; auto *MaskTy = VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true); return Builder.CreateIntrinsic( MaskTy, Intrinsic::experimental_vector_interleave2, Ops, /*FMFSource=*/nullptr, "interleaved.mask"); } if (!BlockInMask) return MaskForGaps; Value *BlockInMaskPart = State.get(BlockInMask, Part); Value *ShuffledMask = Builder.CreateShuffleVector( BlockInMaskPart, createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), "interleaved.mask"); return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) : ShuffledMask; }; // Vectorize the interleaved load group. if (isa(Instr)) { Value *MaskForGaps = nullptr; if (NeedsMaskForGaps) { MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); } // For each unroll part, create a wide load for the group. 
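// Illustrative aside (not part of this file): the two shuffle-mask shapes used
// for interleave groups, written out as plain index/bool vectors. A "stride"
// mask picks member I out of every Factor-wide tuple (what createStrideMask
// builds); a "replicated" mask repeats each block-mask lane Factor times so it
// covers a whole tuple (what createReplicatedMask builds). Helper names are
// hypothetical.
#include <vector>

// E.g. Member=0, Factor=3, VF=4 -> {0, 3, 6, 9}, selecting the R elements.
inline std::vector<int> strideMaskSketch(unsigned Member, unsigned Factor,
                                         unsigned VF) {
  std::vector<int> Mask;
  for (unsigned I = 0; I < VF; ++I)
    Mask.push_back(static_cast<int>(Member + I * Factor));
  return Mask;
}

inline std::vector<bool> replicatedMaskSketch(const std::vector<bool> &BlockMask,
                                              unsigned Factor) {
  std::vector<bool> Out;
  for (bool Lane : BlockMask)
    for (unsigned F = 0; F < Factor; ++F)
      Out.push_back(Lane);
  return Out;
}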
SmallVector NewLoads; for (unsigned Part = 0; Part < UF; Part++) { Instruction *NewLoad; if (BlockInMask || MaskForGaps) { assert(useMaskedInterleavedAccesses(*TTI) && "masked interleaved groups are not allowed."); Value *GroupMask = CreateGroupMask(Part, MaskForGaps); NewLoad = Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), GroupMask, PoisonVec, "wide.masked.vec"); } else NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], Group->getAlign(), "wide.vec"); Group->addMetadata(NewLoad); NewLoads.push_back(NewLoad); } if (VecTy->isScalableTy()) { assert(InterleaveFactor == 2 && "Unsupported deinterleave factor for scalable vectors"); for (unsigned Part = 0; Part < UF; ++Part) { // Scalable vectors cannot use arbitrary shufflevectors (only splats), // so must use intrinsics to deinterleave. Value *DI = Builder.CreateIntrinsic( Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part], /*FMFSource=*/nullptr, "strided.vec"); unsigned J = 0; for (unsigned I = 0; I < InterleaveFactor; ++I) { Instruction *Member = Group->getMember(I); if (!Member) continue; Value *StridedVec = Builder.CreateExtractValue(DI, I); // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { VectorType *OtherVTy = VectorType::get(Member->getType(), VF); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } if (Group->isReverse()) StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); State.set(VPDefs[J], StridedVec, Part); ++J; } } return; } // For each member in the group, shuffle out the appropriate data from the // wide loads. unsigned J = 0; for (unsigned I = 0; I < InterleaveFactor; ++I) { Instruction *Member = Group->getMember(I); // Skip the gaps in the group. if (!Member) continue; auto StrideMask = createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( NewLoads[Part], StrideMask, "strided.vec"); // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { assert(!VF.isScalable() && "VF is assumed to be non scalable."); VectorType *OtherVTy = VectorType::get(Member->getType(), VF); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } if (Group->isReverse()) StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); State.set(VPDefs[J], StridedVec, Part); } ++J; } return; } // The sub vector type for current instruction. auto *SubVT = VectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. Value *MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && "masked interleaved groups are not allowed."); assert((!MaskForGaps || !VF.isScalable()) && "masking gaps for scalable vectors is not yet supported."); for (unsigned Part = 0; Part < UF; Part++) { // Collect the stored vector from each member. SmallVector StoredVecs; unsigned StoredIdx = 0; for (unsigned i = 0; i < InterleaveFactor; i++) { assert((Group->getMember(i) || MaskForGaps) && "Fail to get a member from an interleaved store group"); Instruction *Member = Group->getMember(i); // Skip the gaps in the group. 
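// Illustrative aside (not part of this file): the layout produced when the
// per-member vectors are interleaved back into one wide vector for the store,
// i.e. Wide[I * Factor + Member] = Members[Member][I]. A plain-index model of
// what interleaveVectors computes; the helper is hypothetical.
#include <cstddef>
#include <vector>

inline std::vector<int>
interleaveVectorsSketch(const std::vector<std::vector<int>> &Members) {
  const std::size_t Factor = Members.size();
  const std::size_t VF = Factor ? Members[0].size() : 0;
  std::vector<int> Wide(Factor * VF);
  for (std::size_t M = 0; M < Factor; ++M)
    for (std::size_t I = 0; I < VF; ++I)
      Wide[I * Factor + M] = Members[M][I];
  return Wide;
}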
if (!Member) { Value *Undef = PoisonValue::get(SubVT); StoredVecs.push_back(Undef); continue; } Value *StoredVec = State.get(StoredValues[StoredIdx], Part); ++StoredIdx; if (Group->isReverse()) StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); // If this member has different type, cast it to a unified type. if (StoredVec->getType() != SubVT) StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); StoredVecs.push_back(StoredVec); } // Interleave all the smaller vectors into one wider vector. Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec"); Instruction *NewStoreInstr; if (BlockInMask || MaskForGaps) { Value *GroupMask = CreateGroupMask(Part, MaskForGaps); NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], Group->getAlign(), GroupMask); } else NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); Group->addMetadata(NewStoreInstr); } } void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, VPTransformState &State) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for // the first lane and part. if (isa(Instr)) if (!Instance.isFirstIteration()) return; // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); RepRecipe->setFlags(Cloned); if (Instr->getDebugLoc()) State.setDebugLocFromInst(Instr); // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. for (const auto &I : enumerate(RepRecipe->operands())) { auto InputInstance = Instance; VPValue *Operand = I.value(); if (vputils::isUniformAfterVectorization(Operand)) InputInstance.Lane = VPLane::getFirstLane(); Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); } State.addNewMetadata(Cloned, Instr); // Place the cloned scalar in the new loop. State.Builder.Insert(Cloned); State.set(RepRecipe, Cloned, Instance); // If we just cloned a new assumption, add it the assumption cache. if (auto *II = dyn_cast(Cloned)) AC->registerAssumption(II); // End if-block. bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator(); if (IfPredicateInstr) PredicatedInstructions.push_back(Cloned); } Value * InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { if (VectorTripCount) return VectorTripCount; Value *TC = getTripCount(); IRBuilder<> Builder(InsertBlock->getTerminator()); Type *Ty = TC->getType(); // This is where we can make the step a runtime constant. Value *Step = createStepForVF(Builder, Ty, VF, UF); // If the tail is to be folded by masking, round the number of iterations N // up to a multiple of Step instead of rounding down. This is done by first // adding Step-1 and then rounding down. Note that it's ok if this addition // overflows: the vector induction variable will eventually wrap to zero given // that it starts at zero and its Step is a power of two; the loop will then // exit, with the last early-exit vector comparison also producing all-true. // For scalable vectors the VF is not guaranteed to be a power of 2, but this // is accounted for in emitIterationCountCheck that adds an overflow check. 
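// Illustrative aside (not part of this file): the trip-count arithmetic
// performed below, on plain unsigned integers. With tail folding the count is
// rounded up to a multiple of Step; otherwise the remainder is peeled off, and
// forced to a full Step when a scalar epilogue must run even though Step
// divides the trip count evenly. The helper is hypothetical.
#include <cstdint>

inline uint64_t vectorTripCountSketch(uint64_t TC, uint64_t Step,
                                      bool FoldTailByMasking,
                                      bool RequiresScalarEpilogue) {
  if (FoldTailByMasking)
    return ((TC + Step - 1) / Step) * Step; // masked loop runs all iterations
  uint64_t Remainder = TC % Step;
  if (RequiresScalarEpilogue && Remainder == 0)
    Remainder = Step; // guarantee the scalar epilogue executes
  return TC - Remainder;
}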
if (Cost->foldTailByMasking()) { assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); TC = Builder.CreateAdd( TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); } // Now we need to generate the expression for the part of the loop that the // vectorized body will execute. This is equal to N - (N % Step) if scalar // iterations are not required for correctness, or N - Step, otherwise. Step // is equal to the vectorization factor (number of SIMD elements) times the // unroll factor (number of SIMD instructions). Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); // There are cases where we *must* run at least one iteration in the remainder // loop. See the cost model for when this can happen. If the step evenly // divides the trip count, we set the remainder to be equal to the step. If // the step does not evenly divide the trip count, no adjustment is necessary // since there will already be scalar iterations. Note that the minimum // iterations check ensures that N >= Step. if (Cost->requiresScalarEpilogue(VF.isVector())) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); return VectorTripCount; } Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL) { // Verify that V is a vector type with same number of elements as DstVTy. auto *DstFVTy = cast(DstVTy); auto VF = DstFVTy->getElementCount(); auto *SrcVecTy = cast(V->getType()); assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); Type *SrcElemTy = SrcVecTy->getElementType(); Type *DstElemTy = DstFVTy->getElementType(); assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && "Vector elements must have same size"); // Do a direct cast if element types are castable. if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { return Builder.CreateBitOrPointerCast(V, DstFVTy); } // V cannot be directly casted to desired vector type. // May happen when V is a floating point vector but DstVTy is a vector of // pointers or vice-versa. Handle this using a two-step bitcast using an // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && "Only one type should be a pointer type"); assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && "Only one type should be a floating point type"); Type *IntTy = IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); auto *VecIntTy = VectorType::get(IntTy, VF); Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); } void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { Value *Count = getTripCount(); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; IRBuilder<> Builder(TCCheckBlock->getTerminator()); // Generate code to check if the loop's trip count is less than VF * UF, or // equal to it in case a scalar epilogue is required; this implies that the // vector trip count is zero. This check also covers the case where adding one // to the backedge-taken count overflowed leading to an incorrect trip count // of zero. 
In this case we will also jump to the scalar loop. auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; // If tail is to be folded, vector loop takes care of all iterations. Type *CountTy = Count->getType(); Value *CheckMinIters = Builder.getFalse(); auto CreateStep = [&]() -> Value * { // Create step with max(MinProTripCount, UF * VF). if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) return createStepForVF(Builder, CountTy, VF, UF); Value *MinProfTC = createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); if (!VF.isScalable()) return MinProfTC; return Builder.CreateBinaryIntrinsic( Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); }; TailFoldingStyle Style = Cost->getTailFoldingStyle(); if (Style == TailFoldingStyle::None) CheckMinIters = Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); else if (VF.isScalable() && !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { // vscale is not necessarily a power-of-2, which means we cannot guarantee // an overflow to zero when updating induction variables and so an // additional overflow check is required before entering the vector loop. // Get the maximum unsigned value for the type. Value *MaxUIntTripCount = ConstantInt::get(CountTy, cast(CountTy)->getMask()); Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); // Don't execute the vector loop if (UMax - n) < (VF * UF). CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); } // Create new preheader for vector loop. LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, "vector.ph"); assert(DT->properlyDominates(DT->getNode(TCCheckBlock), DT->getNode(Bypass)->getIDom()) && "TC check is expected to dominate Bypass"); // Update dominator for Bypass & LoopExit (if needed). DT->changeImmediateDominator(Bypass, TCCheckBlock); if (!Cost->requiresScalarEpilogue(VF.isVector())) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); ReplaceInstWithInst( TCCheckBlock->getTerminator(), BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); LoopBypassBlocks.push_back(TCCheckBlock); } BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { BasicBlock *const SCEVCheckBlock = RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); if (!SCEVCheckBlock) return nullptr; assert(!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile && Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && "Cannot SCEV check stride or overflow when optimizing for size"); // Update dominator only if this is first RT check. if (LoopBypassBlocks.empty()) { DT->changeImmediateDominator(Bypass, SCEVCheckBlock); if (!Cost->requiresScalarEpilogue(VF.isVector())) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); } LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; return SCEVCheckBlock; } BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { // VPlan-native path does not do any analysis for runtime checks currently. 
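// Illustrative aside (not part of this file): the minimum-iteration check that
// emitIterationCountCheck above emits, in scalar form. The vector loop is
// bypassed when the trip count is below max(VF * UF, MinProfitableTC), using
// "<=" when the final iterations must run in a scalar epilogue. The helper is
// hypothetical and ignores the tail-folding and overflow-check variants.
#include <algorithm>
#include <cstdint>

inline bool takeScalarLoopSketch(uint64_t TC, uint64_t VF, uint64_t UF,
                                 uint64_t MinProfitableTC,
                                 bool RequiresScalarEpilogue) {
  const uint64_t Step = std::max(VF * UF, MinProfitableTC);
  return RequiresScalarEpilogue ? TC <= Step : TC < Step;
}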
if (EnableVPlanNativePath) return nullptr; BasicBlock *const MemCheckBlock = RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); // Check if we generated code that checks in runtime if arrays overlap. We put // the checks into a separate block to make the more common case of few // elements faster. if (!MemCheckBlock) return nullptr; if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced " "to vectorize."); ORE->emit([&]() { return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", OrigLoop->getStartLoc(), OrigLoop->getHeader()) << "Code-size may be reduced by not forcing " "vectorization, or by source-code modifications " "eliminating the need for runtime checks " "(e.g., adding 'restrict')."; }); } LoopBypassBlocks.push_back(MemCheckBlock); AddedSafetyChecks = true; return MemCheckBlock; } void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopScalarBody = OrigLoop->getHeader(); LoopVectorPreHeader = OrigLoop->getLoopPreheader(); assert(LoopVectorPreHeader && "Invalid loop structure"); LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && "multiple exit loop without required epilogue?"); LoopMiddleBlock = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, LI, nullptr, Twine(Prefix) + "middle.block"); LoopScalarPreHeader = SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, nullptr, Twine(Prefix) + "scalar.ph"); auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); // Set up the middle block terminator. Two cases: // 1) If we know that we must execute the scalar epilogue, emit an // unconditional branch. // 2) Otherwise, we must have a single unique exit block (due to how we // implement the multiple exit case). In this case, set up a conditional // branch from the middle block to the loop scalar preheader, and the // exit block. completeLoopSkeleton will update the condition to use an // iteration check, if required to decide whether to execute the remainder. BranchInst *BrInst = Cost->requiresScalarEpilogue(VF.isVector()) ? BranchInst::Create(LoopScalarPreHeader) : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); // Update dominator for loop exit. During skeleton creation, only the vector // pre-header and the middle block are created. The vector loop is entirely // created during VPlan exection. if (!Cost->requiresScalarEpilogue(VF.isVector())) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); } PHINode *InnerLoopVectorizer::createInductionResumeValue( PHINode *OrigPhi, const InductionDescriptor &II, Value *Step, ArrayRef BypassBlocks, std::pair AdditionalBypass) { Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); assert(VectorTripCount && "Expected valid arguments"); Instruction *OldInduction = Legal->getPrimaryInduction(); Value *&EndValue = IVEndValues[OrigPhi]; Value *EndValueFromAdditionalBypass = AdditionalBypass.second; if (OrigPhi == OldInduction) { // We know what the end value is. 
EndValue = VectorTripCount; } else { IRBuilder<> B(LoopVectorPreHeader->getTerminator()); // Fast-math-flags propagate from the original induction instruction. if (II.getInductionBinOp() && isa(II.getInductionBinOp())) B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II); EndValue->setName("ind.end"); // Compute the end value for the additional bypass (if applicable). if (AdditionalBypass.first) { B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); EndValueFromAdditionalBypass = emitTransformedIndex( B, AdditionalBypass.second, II.getStartValue(), Step, II); EndValueFromAdditionalBypass->setName("ind.end"); } } // Create phi nodes to merge from the backedge-taken check block. PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", LoopScalarPreHeader->getTerminator()); // Copy original phi DL over to the new one. BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); // Fix the scalar body counter (PHI node). // The old induction's phi node in the scalar body needs the truncated // value. for (BasicBlock *BB : BypassBlocks) BCResumeVal->addIncoming(II.getStartValue(), BB); if (AdditionalBypass.first) BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, EndValueFromAdditionalBypass); return BCResumeVal; } /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV /// expansion results. static Value *getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs) { const SCEV *Step = ID.getStep(); if (auto *C = dyn_cast(Step)) return C->getValue(); if (auto *U = dyn_cast(Step)) return U->getValue(); auto I = ExpandedSCEVs.find(Step); assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point"); return I->second; } void InnerLoopVectorizer::createInductionResumeValues( const SCEV2ValueTy &ExpandedSCEVs, std::pair AdditionalBypass) { assert(((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && "Inconsistent information about additional bypass."); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. // The starting values of PHI nodes depend on the counter of the last // iteration in the vectorized loop. // If we come from a bypass edge then we need to start from the original // start value. for (const auto &InductionEntry : Legal->getInductionVars()) { PHINode *OrigPhi = InductionEntry.first; const InductionDescriptor &II = InductionEntry.second; PHINode *BCResumeVal = createInductionResumeValue( OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks, AdditionalBypass); OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } } BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { // The trip counts should be cached by now. Value *Count = getTripCount(); Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. 
Three cases: // 1) If we require a scalar epilogue, there is no conditional branch as // we unconditionally branch to the scalar preheader. Do nothing. // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. // Thus if tail is to be folded, we know we don't need to run the // remainder and we can use the previous value for the condition (true). // 3) Otherwise, construct a runtime check. if (!Cost->requiresScalarEpilogue(VF.isVector()) && !Cost->foldTailByMasking()) { Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, VectorTripCount, "cmp.n", LoopMiddleBlock->getTerminator()); // Here we use the same DebugLoc as the scalar loop latch terminator instead // of the corresponding compare because they may have ended up with // different line numbers and we want to avoid awkward line stepping while // debugging. Eg. if the compare has got a line number inside the loop. CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); cast(LoopMiddleBlock->getTerminator())->setCondition(CmpN); } #ifdef EXPENSIVE_CHECKS assert(DT->verify(DominatorTree::VerificationLevel::Fast)); #endif return LoopVectorPreHeader; } std::pair InnerLoopVectorizer::createVectorizedLoopSkeleton( const SCEV2ValueTy &ExpandedSCEVs) { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the scalar remainder. [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's / | preheader are expanded here. Eventually all required SCEV / | expansion should happen here. / v | [ ] <-- vector loop bypass (may consist of multiple blocks). | / | | / v || [ ] <-- vector pre header. |/ | | v | [ ] \ | [ ]_| <-- vector loop (created during VPlan execution). | | | v \ -[ ] <--- middle-block. \/ | /\ v | ->[ ] <--- new preheader. | | (opt) v <-- edge from middle to exit iff epilogue is not required. | [ ] \ | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). \ | \ v >[ ] <-- exit block(s). ... */ // Create an empty vector loop, and prepare basic blocks for the runtime // checks. createVectorLoopSkeleton(""); // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. This check also covers the case where the // backedge-taken count is uint##_max: adding one to it will overflow leading // to an incorrect trip count of zero. In this (rare) case we will also jump // to the scalar loop. emitIterationCountCheck(LoopScalarPreHeader); // Generate the code to check any assumptions that we've made for SCEV // expressions. emitSCEVChecks(LoopScalarPreHeader); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. emitMemRuntimeChecks(LoopScalarPreHeader); // Emit phis for the new starting index of the scalar loop. createInductionResumeValues(ExpandedSCEVs); return {completeLoopSkeleton(), nullptr}; } // Fix up external users of the induction variable. At this point, we are // in LCSSA form, with all external PHIs that use the IV having one input value, // coming from the remainder loop. We need those PHIs to also have a correct // value for the IV when arriving directly from the middle block. 
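// Illustrative aside (not part of this file): the two escape values that
// fixupIVUsers below provides for external users of an integer induction
// variable, on plain integers. Users of the post-increment value see the value
// after VecTC scalar steps; users of the phi see the penultimate value, one
// step earlier. Assumes VecTC >= 1; the helper is hypothetical.
#include <cstdint>
#include <utility>

inline std::pair<int64_t, int64_t>
ivEscapeValuesSketch(int64_t Start, int64_t Step, uint64_t VecTC) {
  int64_t Last = Start + Step * static_cast<int64_t>(VecTC);
  int64_t Penultimate = Start + Step * static_cast<int64_t>(VecTC - 1);
  return {Last, Penultimate};
}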
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *VectorTripCount, Value *EndValue,
                                       BasicBlock *MiddleBlock,
                                       BasicBlock *VectorHeader, VPlan &Plan,
                                       VPTransformState &State) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the
  // penultimate value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent
  // SCEVs, that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Value *CountMinusOne = B.CreateSub(
          VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
      CountMinusOne->setName("cmo");

      VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
      assert(StepVPV && "step must have been expanded during VPlan execution");
      Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
                                        : State.get(StepVPV, {0, 0});
      Value *Escape =
          emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
      PHI->addIncoming(I.second, MiddleBlock);
      Plan.removeLiveOut(PHI);
    }
  }
}

namespace {
struct CSEDenseMapInfo {
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace

/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple cse.
SmallDenseMap CSEMap; for (Instruction &In : llvm::make_early_inc_range(*BB)) { if (!CSEDenseMapInfo::canHandle(&In)) continue; // Check if we can replace this instruction with any of the // visited instructions. if (Instruction *V = CSEMap.lookup(&In)) { In.replaceAllUsesWith(V); In.eraseFromParent(); continue; } CSEMap[&In] = &In; } } InstructionCost LoopVectorizationCostModel::getVectorCallCost( CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const { Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector Tys, ScalarTys; bool MaskRequired = Legal->isMaskRequired(CI); for (auto &ArgOp : CI->args()) ScalarTys.push_back(ArgOp->getType()); // Estimate cost of scalarized vector call. The source operands are assumed // to be vectors, so we need to extract individual elements from there, // execute VF scalar calls, and then gather the result into the vector return // value. TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind); if (VF.isScalar()) return ScalarCallCost; // Compute corresponding vector type for return value and arguments. Type *RetTy = ToVectorTy(ScalarRetTy, VF); for (Type *ScalarTy : ScalarTys) Tys.push_back(ToVectorTy(ScalarTy, VF)); // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF, CostKind); InstructionCost Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; // If we can't emit a vector call for this function, then the currently found // cost is the cost we need to return. InstructionCost MaskCost = 0; VFShape Shape = VFShape::get(*CI, VF, MaskRequired); if (NeedsMask) *NeedsMask = MaskRequired; Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); // If we want an unmasked vector function but can't find one matching the VF, // maybe we can find vector function that does use a mask and synthesize // an all-true mask. if (!VecFunc && !MaskRequired) { Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true); VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); // If we found one, add in the cost of creating a mask if (VecFunc) { if (NeedsMask) *NeedsMask = true; MaskCost = TTI.getShuffleCost( TargetTransformInfo::SK_Broadcast, VectorType::get( IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()), VF)); } } // We don't support masked function calls yet, but we can scalarize a // masked call with branches (unless VF is scalable). if (!TLI || CI->isNoBuiltin() || !VecFunc) return VF.isScalable() ? InstructionCost::getInvalid() : Cost; // If the corresponding vector cost is cheaper, return its cost. 
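The comparison performed just below can be pictured with a standalone sketch: the scalarized baseline is one scalar call per lane plus packing/unpacking overhead, and a vector variant (with any mask-materialization cost folded in) only wins if it is cheaper. All names and costs here are made up; this is not TTI's real cost model.

// Hypothetical numbers; not TTI's real cost model.
#include <cstdint>
#include <iostream>
#include <optional>

struct CallCosts {
  uint64_t ScalarCall;                // cost of one scalar call
  uint64_t Scalarization;             // pack/unpack overhead for all lanes
  std::optional<uint64_t> VectorCall; // cost of a vector variant, if any
};

static uint64_t pickCallCost(const CallCosts &C, unsigned VF) {
  uint64_t Cost = C.ScalarCall * VF + C.Scalarization; // scalarized baseline
  if (C.VectorCall && *C.VectorCall < Cost)            // vector variant wins?
    Cost = *C.VectorCall;
  return Cost;
}

int main() {
  CallCosts C{/*ScalarCall=*/10, /*Scalarization=*/8, /*VectorCall=*/24};
  std::cout << pickCallCost(C, /*VF=*/4) << "\n"; // 24: beats the 48 baseline
  return 0;
}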
InstructionCost VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; if (VectorCallCost < Cost) { *Variant = VecFunc; Cost = VectorCallCost; } return Cost; } static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) return Elt; return VectorType::get(Elt, VF); } InstructionCost LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); Type *RetTy = MaybeVectorizeType(CI->getType(), VF); FastMathFlags FMF; if (auto *FPMO = dyn_cast(CI)) FMF = FPMO->getFastMathFlags(); SmallVector Arguments(CI->args()); FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); SmallVector ParamTys; std::transform(FTy->param_begin(), FTy->param_end(), std::back_inserter(ParamTys), [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, dyn_cast(CI)); return TTI.getIntrinsicInstrCost(CostAttrs, TargetTransformInfo::TCK_RecipThroughput); } static Type *smallestIntegerVectorType(Type *T1, Type *T2) { auto *I1 = cast(cast(T1)->getElementType()); auto *I2 = cast(cast(T2)->getElementType()); return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; } static Type *largestIntegerVectorType(Type *T1, Type *T2) { auto *I1 = cast(cast(T1)->getElementType()); auto *I2 = cast(cast(T2)->getElementType()); return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; } void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { // For every instruction `I` in MinBWs, truncate the operands, create a // truncated version of `I` and reextend its result. InstCombine runs // later and will remove any ext/trunc pairs. SmallPtrSet Erased; for (const auto &KV : Cost->getMinimalBitwidths()) { // If the value wasn't vectorized, we must maintain the original scalar // type. The absence of the value from State indicates that it // wasn't vectorized. // FIXME: Should not rely on getVPValue at this point. VPValue *Def = State.Plan->getVPValue(KV.first, true); if (!State.hasAnyVectorValue(Def)) continue; for (unsigned Part = 0; Part < UF; ++Part) { Value *I = State.get(Def, Part); if (Erased.count(I) || I->use_empty() || !isa(I)) continue; Type *OriginalTy = I->getType(); Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), KV.second); auto *TruncatedTy = VectorType::get( ScalarTruncatedTy, cast(OriginalTy)->getElementCount()); if (TruncatedTy == OriginalTy) continue; IRBuilder<> B(cast(I)); auto ShrinkOperand = [&](Value *V) -> Value * { if (auto *ZI = dyn_cast(V)) if (ZI->getSrcTy() == TruncatedTy) return ZI->getOperand(0); return B.CreateZExtOrTrunc(V, TruncatedTy); }; // The actual instruction modification depends on the instruction type, // unfortunately. Value *NewI = nullptr; if (auto *BO = dyn_cast(I)) { NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), ShrinkOperand(BO->getOperand(1))); // Any wrapping introduced by shrinking this operation shouldn't be // considered undefined behavior. So, we can't unconditionally copy // arithmetic wrapping flags to NewI. 
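The claim in the comment above can be checked with a tiny standalone program: when only the low bits of a result are demanded, doing the arithmetic in the narrow type gives the same low bits even if it wraps. Here uint8_t stands in for the narrowed vector element type; the helper name is hypothetical.

// Standalone check; uint8_t stands in for the narrowed element type.
#include <cassert>
#include <cstdint>

static uint32_t addInMinimalBitwidth(uint32_t A, uint32_t B) {
  uint8_t NarrowA = static_cast<uint8_t>(A);          // "ShrinkOperand"
  uint8_t NarrowB = static_cast<uint8_t>(B);
  uint8_t NarrowSum = static_cast<uint8_t>(NarrowA + NarrowB); // may wrap
  return static_cast<uint32_t>(NarrowSum);             // re-extend the result
}

int main() {
  // The low 8 bits agree with the full-width computation.
  assert(addInMinimalBitwidth(0x1234u, 0x00FFu) ==
         ((0x1234u + 0x00FFu) & 0xFFu));
  return 0;
}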
cast(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); } else if (auto *CI = dyn_cast(I)) { NewI = B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), ShrinkOperand(CI->getOperand(1))); } else if (auto *SI = dyn_cast(I)) { NewI = B.CreateSelect(SI->getCondition(), ShrinkOperand(SI->getTrueValue()), ShrinkOperand(SI->getFalseValue())); } else if (auto *CI = dyn_cast(I)) { switch (CI->getOpcode()) { default: llvm_unreachable("Unhandled cast!"); case Instruction::Trunc: NewI = ShrinkOperand(CI->getOperand(0)); break; case Instruction::SExt: NewI = B.CreateSExtOrTrunc( CI->getOperand(0), smallestIntegerVectorType(OriginalTy, TruncatedTy)); break; case Instruction::ZExt: NewI = B.CreateZExtOrTrunc( CI->getOperand(0), smallestIntegerVectorType(OriginalTy, TruncatedTy)); break; } } else if (auto *SI = dyn_cast(I)) { auto Elements0 = cast(SI->getOperand(0)->getType())->getElementCount(); auto *O0 = B.CreateZExtOrTrunc( SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); auto Elements1 = cast(SI->getOperand(1)->getType())->getElementCount(); auto *O1 = B.CreateZExtOrTrunc( SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); } else if (isa(I) || isa(I)) { // Don't do anything with the operands, just extend the result. continue; } else if (auto *IE = dyn_cast(I)) { auto Elements = cast(IE->getOperand(0)->getType())->getElementCount(); auto *O0 = B.CreateZExtOrTrunc( IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); } else if (auto *EE = dyn_cast(I)) { auto Elements = cast(EE->getOperand(0)->getType())->getElementCount(); auto *O0 = B.CreateZExtOrTrunc( EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); NewI = B.CreateExtractElement(O0, EE->getOperand(2)); } else { // If we don't know what to do, be conservative and don't do anything. continue; } // Lastly, extend the result. NewI->takeName(cast(I)); Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); I->replaceAllUsesWith(Res); cast(I)->eraseFromParent(); Erased.insert(I); State.reset(Def, Res, Part); } } // We'll have created a bunch of ZExts that are now parentless. Clean up. for (const auto &KV : Cost->getMinimalBitwidths()) { // If the value wasn't vectorized, we must maintain the original scalar // type. The absence of the value from State indicates that it // wasn't vectorized. // FIXME: Should not rely on getVPValue at this point. VPValue *Def = State.Plan->getVPValue(KV.first, true); if (!State.hasAnyVectorValue(Def)) continue; for (unsigned Part = 0; Part < UF; ++Part) { Value *I = State.get(Def, Part); ZExtInst *Inst = dyn_cast(I); if (Inst && Inst->use_empty()) { Value *NewI = Inst->getOperand(0); Inst->eraseFromParent(); State.reset(Def, NewI, Part); } } } } void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, VPlan &Plan) { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. if (VF.isVector()) truncateToMinimalBitwidths(State); // Fix widened non-induction PHIs by setting up the PHI operands. if (EnableVPlanNativePath) fixNonInductionPHIs(Plan, State); // At this point every instruction in the original loop is widened to a // vector form. Now we need to fix the recurrences in the loop. These PHI // nodes are currently empty because we did not want to introduce cycles. // This is the second stage of vectorizing recurrences. 
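fixCrossIterationPHIs, called next, is where this patch reorders the reduction fix-ups. Its comparator can be pictured with the standalone sketch below: reductions with an intermediate store are processed first, and among those the one whose store appears later goes first. A store's position in the block stands in for DT->dominates; all names are hypothetical.

// Standalone sketch; positions stand in for dominance, names are made up.
#include <algorithm>
#include <cassert>
#include <optional>
#include <vector>

struct ReductionInfo {
  int Id;
  std::optional<int> StorePos; // position of the intermediate store, if any
};

static void orderForFixup(std::vector<ReductionInfo> &Rdx) {
  std::stable_sort(Rdx.begin(), Rdx.end(),
                   [](const ReductionInfo &A, const ReductionInfo &B) {
    if (!A.StorePos && !B.StorePos) return false; // keep relative order
    if (A.StorePos && !B.StorePos) return true;   // stores go to the front
    if (!A.StorePos && B.StorePos) return false;
    // Both have stores: the one whose store appears later is fixed up first,
    // so the re-emitted final stores keep the original program order.
    return *A.StorePos > *B.StorePos;
  });
}

int main() {
  std::vector<ReductionInfo> Rdx = {{0, std::nullopt}, {1, 10}, {2, 42}};
  orderForFixup(Rdx);
  assert(Rdx[0].Id == 2 && Rdx[1].Id == 1 && Rdx[2].Id == 0);
  return 0;
}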
fixCrossIterationPHIs(State); // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); // After vectorization, the exit blocks of the original loop will have // additional predecessors. Invalidate SCEVs for the exit phis in case SE // looked through single-entry phis. SmallVector ExitBlocks; OrigLoop->getExitBlocks(ExitBlocks); for (BasicBlock *Exit : ExitBlocks) for (PHINode &PN : Exit->phis()) PSE.getSE()->forgetValue(&PN); VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); if (Cost->requiresScalarEpilogue(VF.isVector())) { // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. } else { // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking // the cost model. // If we inserted an edge from the middle block to the unique exit block, // update uses outside the loop (phis) to account for the newly inserted // edge. // Fix-up external users of the induction variables. for (const auto &Entry : Legal->getInductionVars()) fixupIVUsers(Entry.first, Entry.second, getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), IVEndValues[Entry.first], LoopMiddleBlock, VectorLoop->getHeader(), Plan, State); } // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated // in the exit block, so update the builder. State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); for (const auto &KV : Plan.getLiveOuts()) KV.second->fixPhi(Plan, State); for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); // Remove redundant induction instructions. cse(VectorLoop->getHeader()); // Set/update profile weights for the vector and remainder loops as original // loop iterations are now distributed among them. Note that original loop // represented by LoopScalarBody becomes remainder loop after vectorization. // // For cases like foldTailByMasking() and requiresScalarEpiloque() we may // end up getting slightly roughened result but that should be OK since // profile is not inherently precise anyway. Note also possible bypass of // vector code caused by legality checks is ignored, assigning all the weight // to the vector loop, optimistically. // // For scalable vectorization we can't know at compile time how many iterations // of the loop are handled in one vector iteration, so instead assume a pessimistic // vscale of '1'. setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { // In order to support recurrences we need to be able to vectorize Phi nodes. // Phi nodes have cycles, so we need to vectorize them in two stages. This is // stage #2: We now need to fix the recurrences by adding incoming edges to // the currently empty PHI nodes. At this point every instruction in the // original loop is widened to a vector form so we can use them to construct // the incoming edges. VPBasicBlock *Header = State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); + + // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores + // sank outside of the loop would keep the same order as they had in the + // original loop. 
+ SmallVector ReductionPHIList; for (VPRecipeBase &R : Header->phis()) { if (auto *ReductionPhi = dyn_cast(&R)) - fixReduction(ReductionPhi, State); - else if (auto *FOR = dyn_cast(&R)) + ReductionPHIList.emplace_back(ReductionPhi); + } + stable_sort(ReductionPHIList, [this](const VPReductionPHIRecipe *R1, + const VPReductionPHIRecipe *R2) { + auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; + auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; + + // If neither of the recipes has an intermediate store, keep the order the + // same. + if (!IS1 && !IS2) + return false; + + // If only one of the recipes has an intermediate store, then move it + // towards the beginning of the list. + if (IS1 && !IS2) + return true; + + if (!IS1 && IS2) + return false; + + // If both recipes have an intermediate store, then the recipe with the + // later store should be processed earlier. So it should go to the beginning + // of the list. + return DT->dominates(IS2, IS1); + }); + + for (VPReductionPHIRecipe *ReductionPhi : ReductionPHIList) + fixReduction(ReductionPhi, State); + + for (VPRecipeBase &R : Header->phis()) { + if (auto *FOR = dyn_cast(&R)) fixFixedOrderRecurrence(FOR, State); } } void InnerLoopVectorizer::fixFixedOrderRecurrence( VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { // This is the second phase of vectorizing first-order recurrences. An // overview of the transformation is described below. Suppose we have the // following loop. // // for (int i = 0; i < n; ++i) // b[i] = a[i] - a[i - 1]; // // There is a first-order recurrence on "a". For this loop, the shorthand // scalar IR looks like: // // scalar.ph: // s_init = a[-1] // br scalar.body // // scalar.body: // i = phi [0, scalar.ph], [i+1, scalar.body] // s1 = phi [s_init, scalar.ph], [s2, scalar.body] // s2 = a[i] // b[i] = s2 - s1 // br cond, scalar.body, ... // // In this example, s1 is a recurrence because it's value depends on the // previous iteration. In the first phase of vectorization, we created a // vector phi v1 for s1. We now complete the vectorization and produce the // shorthand vector IR shown below (for VF = 4, UF = 1). // // vector.ph: // v_init = vector(..., ..., ..., a[-1]) // br vector.body // // vector.body // i = phi [0, vector.ph], [i+4, vector.body] // v1 = phi [v_init, vector.ph], [v2, vector.body] // v2 = a[i, i+1, i+2, i+3]; // v3 = vector(v1(3), v2(0, 1, 2)) // b[i, i+1, i+2, i+3] = v2 - v3 // br cond, vector.body, middle.block // // middle.block: // x = v2(3) // br scalar.ph // // scalar.ph: // s_init = phi [x, middle.block], [a[-1], otherwise] // br scalar.body // // After execution completes the vector loop, we extract the next value of // the recurrence (x) to use as the initial value in the scalar loop. // Extract the last vector element in the middle block. This will be the // initial value for the recurrence when jumping to the scalar loop. 
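Before the extraction code that follows, a minimal standalone illustration of which lanes are pulled out of the last vector of the recurrence: lane VF-1 becomes the scalar resume value and lane VF-2 the value for phis used outside the loop. Array lanes stand in for vector lanes; the example assumes VF = 4.

// Standalone illustration with VF = 4; array lanes stand in for vector lanes.
#include <array>
#include <cassert>

int main() {
  constexpr unsigned VF = 4;
  // Last vector of the recurrence's update, e.g. v2 = a[i..i+3] above.
  std::array<int, VF> Incoming = {10, 11, 12, 13};
  int ExtractForScalar = Incoming[VF - 1];             // "vector.recur.extract"
  int ExtractForPhiUsedOutsideLoop = Incoming[VF - 2]; // "...extract.for.phi"
  assert(ExtractForScalar == 13 && ExtractForPhiUsedOutsideLoop == 12);
  return 0;
}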
VPValue *PreviousDef = PhiR->getBackedgeValue(); Value *Incoming = State.get(PreviousDef, UF - 1); auto *ExtractForScalar = Incoming; auto *IdxTy = Builder.getInt32Ty(); Value *RuntimeVF = nullptr; if (VF.isVector()) { auto *One = ConstantInt::get(IdxTy, 1); Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); auto *LastIdx = Builder.CreateSub(RuntimeVF, One); ExtractForScalar = Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract"); } auto RecurSplice = cast(*PhiR->user_begin()); assert(PhiR->getNumUsers() == 1 && RecurSplice->getOpcode() == VPInstruction::FirstOrderRecurrenceSplice && "recurrence phi must have a single user: FirstOrderRecurrenceSplice"); SmallVector LiveOuts; for (VPUser *U : RecurSplice->users()) if (auto *LiveOut = dyn_cast(U)) LiveOuts.push_back(LiveOut); if (!LiveOuts.empty()) { // Extract the second last element in the middle block if the // Phi is used outside the loop. We need to extract the phi itself // and not the last element (the phi update in the current iteration). This // will be the value when jumping to the exit block from the // LoopMiddleBlock, when the scalar loop is not run at all. Value *ExtractForPhiUsedOutsideLoop = nullptr; if (VF.isVector()) { auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( Incoming, Idx, "vector.recur.extract.for.phi"); } else { assert(UF > 1 && "VF and UF cannot both be 1"); // When loop is unrolled without vectorizing, initialize // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled // value of `Incoming`. This is analogous to the vectorized case above: // extracting the second last element when VF > 1. ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); } for (VPLiveOut *LiveOut : LiveOuts) { assert(!Cost->requiresScalarEpilogue(VF.isVector())); PHINode *LCSSAPhi = LiveOut->getPhi(); LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); State.Plan->removeLiveOut(LCSSAPhi); } } // Fix the initial value of the original recurrence in the scalar loop. Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); PHINode *Phi = cast(PhiR->getUnderlyingValue()); auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); for (auto *BB : predecessors(LoopScalarPreHeader)) { auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; Start->addIncoming(Incoming, BB); } Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); Phi->setName("scalar.recur"); } void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, VPTransformState &State) { PHINode *OrigPhi = cast(PhiR->getUnderlyingValue()); // Get it's reduction variable descriptor. assert(Legal->isReductionVariable(OrigPhi) && "Unable to find the reduction variable"); const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); RecurKind RK = RdxDesc.getRecurrenceKind(); TrackingVH ReductionStartValue = RdxDesc.getRecurrenceStartValue(); Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); State.setDebugLocFromInst(ReductionStartValue); VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); // This is the vector-clone of the value that leaves the loop. Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); // Before each round, move the insertion point right between // the PHIs and the values we are going to write. 
// This allows us to write both PHINodes and the extractelement // instructions. Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); State.setDebugLocFromInst(LoopExitInst); Type *PhiTy = OrigPhi->getType(); VPBasicBlock *LatchVPBB = PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock(); BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB]; // If tail is folded by masking, the vector value to leave the loop should be // a Select choosing between the vectorized LoopExitInst and vectorized Phi, // instead of the former. For an inloop reduction the reduction will already // be predicated, and does not need to be handled here. if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { for (unsigned Part = 0; Part < UF; ++Part) { Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); SelectInst *Sel = nullptr; for (User *U : VecLoopExitInst->users()) { if (isa(U)) { assert(!Sel && "Reduction exit feeding two selects"); Sel = cast(U); } else assert(isa(U) && "Reduction exit must feed Phi's or select"); } assert(Sel && "Reduction exit feeds no select"); State.reset(LoopExitInstDef, Sel, Part); if (isa(Sel)) Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); // If the target can create a predicated operator for the reduction at no // extra cost in the loop (for example a predicated vadd), it can be // cheaper for the select to remain in the loop than be sunk out of it, // and so use the select value for the phi instead of the old // LoopExitValue. if (PreferPredicatedReductionSelect || TTI->preferPredicatedReductionSelect( RdxDesc.getOpcode(), PhiTy, TargetTransformInfo::ReductionFlags())) { auto *VecRdxPhi = cast(State.get(PhiR, Part)); VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); } } } // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); VectorParts RdxParts(UF); for (unsigned Part = 0; Part < UF; ++Part) { RdxParts[Part] = State.get(LoopExitInstDef, Part); Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) : Builder.CreateZExt(Trunc, VecTy); for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) if (U != Trunc) { U->replaceUsesOfWith(RdxParts[Part], Extnd); RdxParts[Part] = Extnd; } } Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); for (unsigned Part = 0; Part < UF; ++Part) { RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); State.reset(LoopExitInstDef, RdxParts[Part], Part); } } // Reduce all of the unrolled parts into a single vector. Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); unsigned Op = RecurrenceDescriptor::getOpcode(RK); // The middle block terminator has already been assigned a DebugLoc here (the // OrigLoop's single latch terminator). We want the whole middle block to // appear to execute on this line because: (a) it is all compiler generated, // (b) these instructions are always executed after evaluating the latch // conditional branch, and (c) other passes may add new predecessors which // terminate on this line. 
This is the easiest way to ensure we don't // accidentally cause an extra step back into the loop while debugging. State.setDebugLocFromInst(LoopMiddleBlock->getTerminator()); if (PhiR->isOrdered()) ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); else { // Floating-point operations should have some FMF to enable the reduction. IRBuilderBase::FastMathFlagGuard FMFG(Builder); Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); for (unsigned Part = 1; Part < UF; ++Part) { Value *RdxPart = State.get(LoopExitInstDef, Part); if (Op != Instruction::ICmp && Op != Instruction::FCmp) { ReducedPartRdx = Builder.CreateBinOp( (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, ReducedPartRdx, RdxPart); else ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); } } // Create the reduction after the loop. Note that inloop reductions create the // target reduction in the loop using a Reduction recipe. if (VF.isVector() && !PhiR->isInLoop()) { ReducedPartRdx = createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (PhiTy != RdxDesc.getRecurrenceType()) ReducedPartRdx = RdxDesc.isSigned() ? Builder.CreateSExt(ReducedPartRdx, PhiTy) : Builder.CreateZExt(ReducedPartRdx, PhiTy); } PHINode *ResumePhi = dyn_cast(PhiR->getStartValue()->getUnderlyingValue()); // Create a phi node that merges control-flow from the backedge-taken check // block and the middle block. PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", LoopScalarPreHeader->getTerminator()); // If we are fixing reductions in the epilogue loop then we should already // have created a bc.merge.rdx Phi after the main vector body. Ensure that // we carry over the incoming values correctly. for (auto *Incoming : predecessors(LoopScalarPreHeader)) { if (Incoming == LoopMiddleBlock) BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), Incoming); else BCBlockPhi->addIncoming(ReductionStartValue, Incoming); } // Set the resume value for this reduction ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); // If there were stores of the reduction value to a uniform memory address // inside the loop, create the final store here. if (StoreInst *SI = RdxDesc.IntermediateStore) { StoreInst *NewSI = Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand()); propagateMetadata(NewSI, SI); // If the reduction value is used in other places, // then let the code below create PHI's for that. } // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. // We know that the loop is in LCSSA form. We need to update the PHI nodes // in the exit blocks. See comment on analogous loop in // fixFixedOrderRecurrence for a more complete explaination of the logic. if (!Cost->requiresScalarEpilogue(VF.isVector())) for (PHINode &LCSSAPhi : LoopExitBlock->phis()) if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); State.Plan->removeLiveOut(&LCSSAPhi); } // Fix the scalar loop reduction variable with the incoming reduction sum // from the vector body and from the backedge value. 
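The part-combining done above (the "bin.rdx" chain over the UF unrolled parts, followed by the final target reduction to a scalar) can be mimicked with plain arrays. This is only a sketch for an integer add reduction; UF = 2 and VF = 4 are assumed.

// Standalone sketch; UF = 2 parts of VF = 4 lanes each, add reduction.
#include <cassert>
#include <numeric>
#include <vector>

static int reduceParts(const std::vector<std::vector<int>> &Parts) {
  std::vector<int> Rdx = Parts[0];
  for (size_t Part = 1; Part < Parts.size(); ++Part)   // "bin.rdx" chain
    for (size_t Lane = 0; Lane < Rdx.size(); ++Lane)
      Rdx[Lane] += Parts[Part][Lane];
  return std::accumulate(Rdx.begin(), Rdx.end(), 0);   // final target reduction
}

int main() {
  assert(reduceParts({{1, 2, 3, 4}, {5, 6, 7, 8}}) == 36);
  return 0;
}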
int IncomingEdgeBlockIdx = OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); // Pick the other block. int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { // The basic block and loop containing the predicated instruction. auto *PredBB = PredInst->getParent(); auto *VectorLoop = LI->getLoopFor(PredBB); // Initialize a worklist with the operands of the predicated instruction. SetVector Worklist(PredInst->op_begin(), PredInst->op_end()); // Holds instructions that we need to analyze again. An instruction may be // reanalyzed if we don't yet know if we can sink it or not. SmallVector InstsToReanalyze; // Returns true if a given use occurs in the predicated block. Phi nodes use // their operands in their corresponding predecessor blocks. auto isBlockOfUsePredicated = [&](Use &U) -> bool { auto *I = cast(U.getUser()); BasicBlock *BB = I->getParent(); if (auto *Phi = dyn_cast(I)) BB = Phi->getIncomingBlock( PHINode::getIncomingValueNumForOperand(U.getOperandNo())); return BB == PredBB; }; // Iteratively sink the scalarized operands of the predicated instruction // into the block we created for it. When an instruction is sunk, it's // operands are then added to the worklist. The algorithm ends after one pass // through the worklist doesn't sink a single instruction. bool Changed; do { // Add the instructions that need to be reanalyzed to the worklist, and // reset the changed indicator. Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); InstsToReanalyze.clear(); Changed = false; while (!Worklist.empty()) { auto *I = dyn_cast(Worklist.pop_back_val()); // We can't sink an instruction if it is a phi node, is not in the loop, // may have side effects or may read from memory. // TODO Could dor more granular checking to allow sinking a load past non-store instructions. if (!I || isa(I) || !VectorLoop->contains(I) || I->mayHaveSideEffects() || I->mayReadFromMemory()) continue; // If the instruction is already in PredBB, check if we can sink its // operands. In that case, VPlan's sinkScalarOperands() succeeded in // sinking the scalar instruction I, hence it appears in PredBB; but it // may have failed to sink I's operands (recursively), which we try // (again) here. if (I->getParent() == PredBB) { Worklist.insert(I->op_begin(), I->op_end()); continue; } // It's legal to sink the instruction if all its uses occur in the // predicated block. Otherwise, there's nothing to do yet, and we may // need to reanalyze the instruction. if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { InstsToReanalyze.push_back(I); continue; } // Move the instruction to the beginning of the predicated block, and add // it's operands to the worklist. I->moveBefore(&*PredBB->getFirstInsertionPt()); Worklist.insert(I->op_begin(), I->op_end()); // The sinking may have enabled other instructions to be sunk, so we will // need to iterate. Changed = true; } } while (Changed); } void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, VPTransformState &State) { auto Iter = vp_depth_first_deep(Plan.getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { for (VPRecipeBase &P : VPBB->phis()) { VPWidenPHIRecipe *VPPhi = dyn_cast(&P); if (!VPPhi) continue; PHINode *NewPhi = cast(State.get(VPPhi, 0)); // Make sure the builder has a valid insert point. 
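The fixed-point iteration in sinkScalarOperands above has a simple shape that a toy model makes explicit: keep sinking values whose every use already sits in the predicated block, and stop once a full pass changes nothing. Integers stand in for instructions; everything here is hypothetical.

// Toy model: integers stand in for instructions; 0 is already predicated.
#include <algorithm>
#include <cassert>
#include <map>
#include <set>
#include <vector>

int main() {
  // 1 feeds only 0, so it can be sunk; 2 also feeds 3, which stays outside.
  std::map<int, std::vector<int>> Users = {{1, {0}}, {2, {1, 3}}};
  std::set<int> InPredBB = {0};

  bool Changed;
  do { // iterate to a fixed point, like the do/while over the worklist above
    Changed = false;
    for (auto &Entry : Users) {
      int I = Entry.first;
      if (InPredBB.count(I))
        continue;
      bool AllUsesPredicated =
          std::all_of(Entry.second.begin(), Entry.second.end(),
                      [&](int U) { return InPredBB.count(U) != 0; });
      if (AllUsesPredicated) { // legal to sink I into the predicated block
        InPredBB.insert(I);
        Changed = true;
      }
    }
  } while (Changed);

  assert(InPredBB.count(1) && !InPredBB.count(2));
  return 0;
}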
Builder.SetInsertPoint(NewPhi); for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { VPValue *Inc = VPPhi->getIncomingValue(i); VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); } } } } bool InnerLoopVectorizer::useOrderedReductions( const RecurrenceDescriptor &RdxDesc) { return Cost->useOrderedReductions(RdxDesc); } void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. assert(VF.isVector() && !Scalars.contains(VF) && "This function should not be visited twice for the same VF"); // This avoids any chances of creating a REPLICATE recipe during planning // since that would result in generation of scalarized code during execution, // which is not supported for scalable vectors. if (VF.isScalable()) { Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); return; } SmallSetVector Worklist; // These sets are used to seed the analysis with pointers used by memory // accesses that will remain scalar. SmallSetVector ScalarPtrs; SmallPtrSet PossibleNonScalarPtrs; auto *Latch = TheLoop->getLoopLatch(); // A helper that returns true if the use of Ptr by MemAccess will be scalar. // The pointer operands of loads and stores will be scalar as long as the // memory access is not a gather or scatter operation. The value operand of a // store will remain scalar if the store is scalarized. auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { InstWidening WideningDecision = getWideningDecision(MemAccess, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); if (auto *Store = dyn_cast(MemAccess)) if (Ptr == Store->getValueOperand()) return WideningDecision == CM_Scalarize; assert(Ptr == getLoadStorePointerOperand(MemAccess) && "Ptr is neither a value or pointer operand"); return WideningDecision != CM_GatherScatter; }; // A helper that returns true if the given value is a bitcast or // getelementptr instruction contained in the loop. auto isLoopVaryingBitCastOrGEP = [&](Value *V) { return ((isa(V) && V->getType()->isPointerTy()) || isa(V)) && !TheLoop->isLoopInvariant(V); }; // A helper that evaluates a memory access's use of a pointer. If the use will // be a scalar use and the pointer is only used by memory accesses, we place // the pointer in ScalarPtrs. Otherwise, the pointer is placed in // PossibleNonScalarPtrs. auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { // We only care about bitcast and getelementptr instructions contained in // the loop. if (!isLoopVaryingBitCastOrGEP(Ptr)) return; // If the pointer has already been identified as scalar (e.g., if it was // also identified as uniform), there's nothing to do. auto *I = cast(Ptr); if (Worklist.count(I)) return; // If the use of the pointer will be a scalar use, and all users of the // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, // place the pointer in PossibleNonScalarPtrs. 
if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { return isa(U) || isa(U); })) ScalarPtrs.insert(I); else PossibleNonScalarPtrs.insert(I); }; // We seed the scalars analysis with three classes of instructions: (1) // instructions marked uniform-after-vectorization and (2) bitcast, // getelementptr and (pointer) phi instructions used by memory accesses // requiring a scalar use. // // (1) Add to the worklist all instructions that have been identified as // uniform-after-vectorization. Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); // (2) Add to the worklist all bitcast and getelementptr instructions used by // memory accesses requiring a scalar use. The pointer operands of loads and // stores will be scalar as long as the memory accesses is not a gather or // scatter operation. The value operand of a store will remain scalar if the // store is scalarized. for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { if (auto *Load = dyn_cast(&I)) { evaluatePtrUse(Load, Load->getPointerOperand()); } else if (auto *Store = dyn_cast(&I)) { evaluatePtrUse(Store, Store->getPointerOperand()); evaluatePtrUse(Store, Store->getValueOperand()); } } for (auto *I : ScalarPtrs) if (!PossibleNonScalarPtrs.count(I)) { LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); Worklist.insert(I); } // Insert the forced scalars. // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector // induction variable when the PHI user is scalarized. auto ForcedScalar = ForcedScalars.find(VF); if (ForcedScalar != ForcedScalars.end()) for (auto *I : ForcedScalar->second) { LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n"); Worklist.insert(I); } // Expand the worklist by looking through any bitcasts and getelementptr // instructions we've already identified as scalar. This is similar to the // expansion step in collectLoopUniforms(); however, here we're only // expanding to include additional bitcasts and getelementptr instructions. unsigned Idx = 0; while (Idx != Worklist.size()) { Instruction *Dst = Worklist[Idx++]; if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) continue; auto *Src = cast(Dst->getOperand(0)); if (llvm::all_of(Src->users(), [&](User *U) -> bool { auto *J = cast(U); return !TheLoop->contains(J) || Worklist.count(J) || ((isa(J) || isa(J)) && isScalarUse(J, Src)); })) { Worklist.insert(Src); LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); } } // An induction variable will remain scalar if all users of the induction // variable and induction variable update remain scalar. for (const auto &Induction : Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast(Ind->getIncomingValueForBlock(Latch)); // If tail-folding is applied, the primary induction variable will be used // to feed a vector compare. if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) continue; // Returns true if \p Indvar is a pointer induction that is used directly by // load/store instruction \p I. auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, Instruction *I) { return Induction.second.getKind() == InductionDescriptor::IK_PtrInduction && (isa(I) || isa(I)) && Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); }; // Determine if all users of the induction variable are scalar after // vectorization. 
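The two-set scheme used above can be reduced to a toy model: every use of a pointer votes it into ScalarPtrs or PossibleNonScalarPtrs, and only pointers never seen in the second set are finally treated as scalar. The names and the Use enum are invented for illustration.

// Toy model of the two-set classification; names are hypothetical.
#include <cassert>
#include <map>
#include <set>
#include <string>
#include <vector>

enum class Use { ScalarMemAccess, NonScalarUse };

int main() {
  std::map<std::string, std::vector<Use>> PtrUses = {
      {"gep.a", {Use::ScalarMemAccess, Use::ScalarMemAccess}},
      {"gep.b", {Use::ScalarMemAccess, Use::NonScalarUse}}};

  std::set<std::string> ScalarPtrs, PossibleNonScalarPtrs;
  for (const auto &P : PtrUses)
    for (Use U : P.second)
      (U == Use::ScalarMemAccess ? ScalarPtrs : PossibleNonScalarPtrs)
          .insert(P.first);

  std::set<std::string> Scalars;
  for (const auto &Ptr : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(Ptr)) // any non-scalar use vetoes it
      Scalars.insert(Ptr);

  assert(Scalars.count("gep.a") && !Scalars.count("gep.b"));
  return 0;
}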
auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { auto *I = cast(U); return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || IsDirectLoadStoreFromPtrIndvar(Ind, I); }); if (!ScalarInd) continue; // Determine if all users of the induction variable update instruction are // scalar after vectorization. auto ScalarIndUpdate = llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { auto *I = cast(U); return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); }); if (!ScalarIndUpdate) continue; // The induction variable and its update instruction will remain scalar. Worklist.insert(Ind); Worklist.insert(IndUpdate); LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n"); } Scalars[VF].insert(Worklist.begin(), Worklist.end()); } bool LoopVectorizationCostModel::isScalarWithPredication( Instruction *I, ElementCount VF) const { if (!isPredicatedInst(I)) return false; // Do we have a non-scalar lowering for this predicated // instruction? No - it is scalar with predication. switch(I->getOpcode()) { default: return true; case Instruction::Call: return !VFDatabase::hasMaskedVariant(*(cast(I)), VF); case Instruction::Load: case Instruction::Store: { auto *Ptr = getLoadStorePointerOperand(I); auto *Ty = getLoadStoreType(I); Type *VTy = Ty; if (VF.isVector()) VTy = VectorType::get(Ty, VF); const Align Alignment = getLoadStoreAlignment(I); return isa(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || TTI.isLegalMaskedGather(VTy, Alignment)) : !(isLegalMaskedStore(Ty, Ptr, Alignment) || TTI.isLegalMaskedScatter(VTy, Alignment)); } case Instruction::UDiv: case Instruction::SDiv: case Instruction::SRem: case Instruction::URem: { // We have the option to use the safe-divisor idiom to avoid predication. // The cost based decision here will always select safe-divisor for // scalable vectors as scalarization isn't legal. const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost); } } } bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { if (!blockNeedsPredicationForAnyReason(I->getParent())) return false; // Can we prove this instruction is safe to unconditionally execute? // If not, we must use some form of predication. switch(I->getOpcode()) { default: return false; case Instruction::Load: case Instruction::Store: { if (!Legal->isMaskRequired(I)) return false; // When we know the load's address is loop invariant and the instruction // in the original scalar loop was unconditionally executed then we // don't need to mark it as a predicated instruction. Tail folding may // introduce additional predication, but we're guaranteed to always have // at least one active lane. We call Legal->blockNeedsPredication here // because it doesn't query tail-folding. For stores, we need to prove // both speculation safety (which follows from the same argument as loads), // but also must prove the value being stored is correct. The easiest // form of the later is to require that all values stored are the same. 
if (Legal->isInvariant(getLoadStorePointerOperand(I)) && (isa(I) || (isa(I) && TheLoop->isLoopInvariant(cast(I)->getValueOperand()))) && !Legal->blockNeedsPredication(I->getParent())) return false; return true; } case Instruction::UDiv: case Instruction::SDiv: case Instruction::SRem: case Instruction::URem: // TODO: We can use the loop-preheader as context point here and get // context sensitive reasoning return !isSafeToSpeculativelyExecute(I); case Instruction::Call: return Legal->isMaskRequired(I); } } std::pair LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, ElementCount VF) const { assert(I->getOpcode() == Instruction::UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode() == Instruction::SRem || I->getOpcode() == Instruction::URem); assert(!isSafeToSpeculativelyExecute(I)); const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; // Scalarization isn't legal for scalable vector types InstructionCost ScalarizationCost = InstructionCost::getInvalid(); if (!VF.isScalable()) { // Get the scalarization cost and scale this amount by the probability of // executing the predicated block. If the instruction is not predicated, // we fall through to the next case. ScalarizationCost = 0; // These instructions have a non-void type, so account for the phi nodes // that we will create. This cost is likely to be zero. The phi node // cost, if any, should be scaled by the block probability because it // models a copy at the end of each predicated block. ScalarizationCost += VF.getKnownMinValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); // The cost of the non-predicated instruction. ScalarizationCost += VF.getKnownMinValue() * TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind); // The cost of insertelement and extractelement instructions needed for // scalarization. ScalarizationCost += getScalarizationOverhead(I, VF, CostKind); // Scale the cost by the probability of executing the predicated blocks. // This assumes the predicated block for each vector lane is equally // likely. ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb(); } InstructionCost SafeDivisorCost = 0; auto *VecTy = ToVectorTy(I->getType(), VF); // The cost of the select guard to ensure all lanes are well defined // after we speculate above any internal control flow. SafeDivisorCost += TTI.getCmpSelInstrCost( Instruction::Select, VecTy, ToVectorTy(Type::getInt1Ty(I->getContext()), VF), CmpInst::BAD_ICMP_PREDICATE, CostKind); // Certain instructions can be cheaper to vectorize if they have a constant // second vector operand. One example of this are shifts on x86. 
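The decision shape of getDivRemSpeculationCost above is: scale the scalarized-with-predication cost by the block probability and compare it against keeping the operation vectorized behind a safe-divisor select. The sketch below only mirrors that arithmetic; every number is made up, and a reciprocal block probability of 2 (a 50% block) is assumed.

// Rough shape of the comparison; all costs are made up.
#include <cstdint>
#include <iostream>

int main() {
  unsigned VF = 4, ReciprocalPredBlockProb = 2;
  uint64_t PhiCost = 1, ScalarDivCost = 20, InsExtOverhead = 8;
  uint64_t SelectCost = 4, VectorDivCost = 60;

  uint64_t ScalarizationCost =
      (VF * PhiCost + VF * ScalarDivCost + InsExtOverhead) /
      ReciprocalPredBlockProb;                 // scale by block probability
  uint64_t SafeDivisorCost = SelectCost + VectorDivCost;

  std::cout << (ScalarizationCost <= SafeDivisorCost
                    ? "scalarize with predication"
                    : "use safe divisor")
            << "\n"; // 46 vs 64 here, so scalarization wins
  return 0;
}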
Value *Op2 = I->getOperand(1); auto Op2Info = TTI.getOperandInfo(Op2); if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isInvariant(Op2)) Op2Info.Kind = TargetTransformInfo::OK_UniformValue; SmallVector Operands(I->operand_values()); SafeDivisorCost += TTI.getArithmeticInstrCost( I->getOpcode(), VecTy, CostKind, {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, Op2Info, Operands, I); return {ScalarizationCost, SafeDivisorCost}; } bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( Instruction *I, ElementCount VF) { assert(isAccessInterleaved(I) && "Expecting interleaved access."); assert(getWideningDecision(I, VF) == CM_Unknown && "Decision should not be set yet."); auto *Group = getInterleavedAccessGroup(I); assert(Group && "Must have a group."); // If the instruction's allocated size doesn't equal it's type size, it // requires padding and will be scalarized. auto &DL = I->getModule()->getDataLayout(); auto *ScalarTy = getLoadStoreType(I); if (hasIrregularType(ScalarTy, DL)) return false; // If the group involves a non-integral pointer, we may not be able to // losslessly cast all values to a common type. unsigned InterleaveFactor = Group->getFactor(); bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); for (unsigned i = 0; i < InterleaveFactor; i++) { Instruction *Member = Group->getMember(i); if (!Member) continue; auto *MemberTy = getLoadStoreType(Member); bool MemberNI = DL.isNonIntegralPointerType(MemberTy); // Don't coerce non-integral pointers to integers or vice versa. if (MemberNI != ScalarNI) { // TODO: Consider adding special nullptr value case here return false; } else if (MemberNI && ScalarNI && ScalarTy->getPointerAddressSpace() != MemberTy->getPointerAddressSpace()) { return false; } } // Check if masking is required. // A Group may need masking for one of two reasons: it resides in a block that // needs predication, or it was decided to use masking to deal with gaps // (either a gap at the end of a load-access that may result in a speculative // load, or any gaps in a store-access). bool PredicatedAccessRequiresMasking = blockNeedsPredicationForAnyReason(I->getParent()) && Legal->isMaskRequired(I); bool LoadAccessWithGapsRequiresEpilogMasking = isa(I) && Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); bool StoreAccessWithGapsRequiresMasking = isa(I) && (Group->getNumMembers() < Group->getFactor()); if (!PredicatedAccessRequiresMasking && !LoadAccessWithGapsRequiresEpilogMasking && !StoreAccessWithGapsRequiresMasking) return true; // If masked interleaving is required, we expect that the user/target had // enabled it, because otherwise it either wouldn't have been created or // it should have been invalidated by the CostModel. assert(useMaskedInterleavedAccesses(TTI) && "Masked interleave-groups for predicated accesses are not enabled."); if (Group->isReverse()) return false; auto *Ty = getLoadStoreType(I); const Align Alignment = getLoadStoreAlignment(I); return isa(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) : TTI.isLegalMaskedStore(Ty, Alignment); } bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( Instruction *I, ElementCount VF) { // Get and ensure we have a valid memory instruction. assert((isa(I)) && "Invalid memory instruction"); auto *Ptr = getLoadStorePointerOperand(I); auto *ScalarTy = getLoadStoreType(I); // In order to be widened, the pointer should be consecutive, first of all. 
if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) return false; // If the instruction is a store located in a predicated block, it will be // scalarized. if (isScalarWithPredication(I, VF)) return false; // If the instruction's allocated size doesn't equal it's type size, it // requires padding and will be scalarized. auto &DL = I->getModule()->getDataLayout(); if (hasIrregularType(ScalarTy, DL)) return false; return true; } void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // We should not collect Uniforms more than once per VF. Right now, // this function is called from collectUniformsAndScalars(), which // already does this check. Collecting Uniforms for VF=1 does not make any // sense. assert(VF.isVector() && !Uniforms.contains(VF) && "This function should not be visited twice for the same VF"); // Visit the list of Uniforms. If we'll not find any uniform value, we'll // not analyze again. Uniforms.count(VF) will return 1. Uniforms[VF].clear(); // We now know that the loop is vectorizable! // Collect instructions inside the loop that will remain uniform after // vectorization. // Global values, params and instructions outside of current loop are out of // scope. auto isOutOfScope = [&](Value *V) -> bool { Instruction *I = dyn_cast(V); return (!I || !TheLoop->contains(I)); }; // Worklist containing uniform instructions demanding lane 0. SetVector Worklist; BasicBlock *Latch = TheLoop->getLoopLatch(); // Add uniform instructions demanding lane 0 to the worklist. Instructions // that are scalar with predication must not be considered uniform after // vectorization, because that would create an erroneous replicating region // where only a single instance out of VF should be formed. // TODO: optimize such seldom cases if found important, see PR40816. auto addToWorklistIfAllowed = [&](Instruction *I) -> void { if (isOutOfScope(I)) { LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " << *I << "\n"); return; } if (isScalarWithPredication(I, VF)) { LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " << *I << "\n"); return; } LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); Worklist.insert(I); }; // Start with the conditional branch. If the branch condition is an // instruction contained in the loop that is only used by the branch, it is // uniform. auto *Cmp = dyn_cast(Latch->getTerminator()->getOperand(0)); if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) addToWorklistIfAllowed(Cmp); auto PrevVF = VF.divideCoefficientBy(2); // Return true if all lanes perform the same memory operation, and we can // thus chose to execute only one. auto isUniformMemOpUse = [&](Instruction *I) { // If the value was already known to not be uniform for the previous // (smaller VF), it cannot be uniform for the larger VF. if (PrevVF.isVector()) { auto Iter = Uniforms.find(PrevVF); if (Iter != Uniforms.end() && !Iter->second.contains(I)) return false; } if (!Legal->isUniformMemOp(*I, VF)) return false; if (isa(I)) // Loading the same address always produces the same result - at least // assuming aliasing and ordering which have already been checked. return true; // Storing the same value on every iteration. 
return TheLoop->isLoopInvariant(cast(I)->getValueOperand()); }; auto isUniformDecision = [&](Instruction *I, ElementCount VF) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); if (isUniformMemOpUse(I)) return true; return (WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse || WideningDecision == CM_Interleave); }; // Returns true if Ptr is the pointer operand of a memory access instruction // I, I is known to not require scalarization, and the pointer is not also // stored. auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { if (isa(I) && I->getOperand(0) == Ptr) return false; return getLoadStorePointerOperand(I) == Ptr && (isUniformDecision(I, VF) || Legal->isInvariant(Ptr)); }; // Holds a list of values which are known to have at least one uniform use. // Note that there may be other uses which aren't uniform. A "uniform use" // here is something which only demands lane 0 of the unrolled iterations; // it does not imply that all lanes produce the same value (e.g. this is not // the usual meaning of uniform) SetVector HasUniformUse; // Scan the loop for instructions which are either a) known to have only // lane 0 demanded or b) are uses which demand only lane 0 of their operand. for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { if (IntrinsicInst *II = dyn_cast(&I)) { switch (II->getIntrinsicID()) { case Intrinsic::sideeffect: case Intrinsic::experimental_noalias_scope_decl: case Intrinsic::assume: case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: if (TheLoop->hasLoopInvariantOperands(&I)) addToWorklistIfAllowed(&I); break; default: break; } } // ExtractValue instructions must be uniform, because the operands are // known to be loop-invariant. if (auto *EVI = dyn_cast(&I)) { assert(isOutOfScope(EVI->getAggregateOperand()) && "Expected aggregate value to be loop invariant"); addToWorklistIfAllowed(EVI); continue; } // If there's no pointer operand, there's nothing to do. auto *Ptr = getLoadStorePointerOperand(&I); if (!Ptr) continue; if (isUniformMemOpUse(&I)) addToWorklistIfAllowed(&I); if (isVectorizedMemAccessUse(&I, Ptr)) HasUniformUse.insert(Ptr); } // Add to the worklist any operands which have *only* uniform (e.g. lane 0 // demanding) users. Since loops are assumed to be in LCSSA form, this // disallows uses outside the loop as well. for (auto *V : HasUniformUse) { if (isOutOfScope(V)) continue; auto *I = cast(V); auto UsersAreMemAccesses = llvm::all_of(I->users(), [&](User *U) -> bool { return isVectorizedMemAccessUse(cast(U), V); }); if (UsersAreMemAccesses) addToWorklistIfAllowed(I); } // Expand Worklist in topological order: whenever a new instruction // is added , its users should be already inside Worklist. It ensures // a uniform instruction will only be used by uniform instructions. unsigned idx = 0; while (idx != Worklist.size()) { Instruction *I = Worklist[idx++]; for (auto *OV : I->operand_values()) { // isOutOfScope operands cannot be uniform instructions. if (isOutOfScope(OV)) continue; // First order recurrence Phi's should typically be considered // non-uniform. auto *OP = dyn_cast(OV); if (OP && Legal->isFixedOrderRecurrence(OP)) continue; // If all the users of the operand are uniform, then add the // operand into the uniform worklist. 
auto *OI = cast(OV); if (llvm::all_of(OI->users(), [&](User *U) -> bool { auto *J = cast(U); return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); })) addToWorklistIfAllowed(OI); } } // For an instruction to be added into Worklist above, all its users inside // the loop should also be in Worklist. However, this condition cannot be // true for phi nodes that form a cyclic dependence. We must process phi // nodes separately. An induction variable will remain uniform if all users // of the induction variable and induction variable update remain uniform. // The code below handles both pointer and non-pointer induction variables. for (const auto &Induction : Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast(Ind->getIncomingValueForBlock(Latch)); // Determine if all users of the induction variable are uniform after // vectorization. auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { auto *I = cast(U); return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || isVectorizedMemAccessUse(I, Ind); }); if (!UniformInd) continue; // Determine if all users of the induction variable update instruction are // uniform after vectorization. auto UniformIndUpdate = llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { auto *I = cast(U); return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || isVectorizedMemAccessUse(I, IndUpdate); }); if (!UniformIndUpdate) continue; // The induction variable and its update instruction will remain uniform. addToWorklistIfAllowed(Ind); addToWorklistIfAllowed(IndUpdate); } Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } bool LoopVectorizationCostModel::runtimeChecksRequired() { LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); if (Legal->getRuntimePointerChecking()->Need) { reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", "runtime pointer checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " "compiling with -Os/-Oz", "CantVersionLoopWithOptForSize", ORE, TheLoop); return true; } if (!PSE.getPredicate().isAlwaysTrue()) { reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", "runtime SCEV checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " "compiling with -Os/-Oz", "CantVersionLoopWithOptForSize", ORE, TheLoop); return true; } // FIXME: Avoid specializing for stride==1 instead of bailing out. if (!Legal->getLAI()->getSymbolicStrides().empty()) { reportVectorizationFailure("Runtime stride check for small trip count", "runtime stride == 1 checks needed. Enable vectorization of " "this loop without such check by compiling with -Os/-Oz", "CantVersionLoopWithOptForSize", ORE, TheLoop); return true; } return false; } ElementCount LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) return ElementCount::getScalable(0); if (Hints->isScalableVectorizationDisabled()) { reportVectorizationInfo("Scalable vectorization is explicitly disabled", "ScalableVectorizationDisabled", ORE, TheLoop); return ElementCount::getScalable(0); } LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); auto MaxScalableVF = ElementCount::getScalable( std::numeric_limits::max()); // Test that the loop-vectorizer can legalize all operations for this MaxVF. 
// FIXME: While for scalable vectors this is currently sufficient, this should // be replaced by a more detailed mechanism that filters out specific VFs, // instead of invalidating vectorization for a whole set of VFs based on the // MaxVF. // Disable scalable vectorization if the loop contains unsupported reductions. if (!canVectorizeReductions(MaxScalableVF)) { reportVectorizationInfo( "Scalable vectorization not supported for the reduction " "operations found in this loop.", "ScalableVFUnfeasible", ORE, TheLoop); return ElementCount::getScalable(0); } // Disable scalable vectorization if the loop contains any instructions // with element types not supported for scalable vectors. if (any_of(ElementTypesInLoop, [&](Type *Ty) { return !Ty->isVoidTy() && !this->TTI.isElementTypeLegalForScalableVector(Ty); })) { reportVectorizationInfo("Scalable vectorization is not supported " "for all element types found in this loop.", "ScalableVFUnfeasible", ORE, TheLoop); return ElementCount::getScalable(0); } if (Legal->isSafeForAnyVectorWidth()) return MaxScalableVF; // Limit MaxScalableVF by the maximum safe dependence distance. if (std::optional MaxVScale = getMaxVScale(*TheFunction, TTI)) MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); else MaxScalableVF = ElementCount::getScalable(0); if (!MaxScalableVF) reportVectorizationInfo( "Max legal vector width too small, scalable vectorization " "unfeasible.", "ScalableVFUnfeasible", ORE, TheLoop); return MaxScalableVF; } FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); // Get the maximum safe dependence distance in bits computed by LAA. // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from // the memory accesses that is most restrictive (involved in the smallest // dependence distance). unsigned MaxSafeElements = llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF << ".\n"); LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF << ".\n"); // First analyze the UserVF, fall back if the UserVF should be ignored. if (UserVF) { auto MaxSafeUserVF = UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { // If `VF=vscale x N` is safe, then so is `VF=N` if (UserVF.isScalable()) return FixedScalableVFPair( ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); else return UserVF; } assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it // is better to ignore the hint and let the compiler choose a suitable VF. 
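// Illustrative sketch (not part of this patch): the handling that follows can
// be summarised as "honour a safe user VF, clamp an unsafe fixed one, and
// drop an unsafe scalable hint". A compact stand-alone model with invented
// names; fixed VFs are plain unsigneds and 0 means "no user request".
namespace user_vf_sketch {
inline unsigned chooseFixedVF(unsigned UserVF, unsigned MaxSafeFixedVF) {
  if (UserVF == 0)
    return MaxSafeFixedVF; // no hint: take the max safe VF
  if (UserVF <= MaxSafeFixedVF)
    return UserVF;         // safe hint: honour it as-is
  return MaxSafeFixedVF;   // unsafe fixed hint: clamp (and diagnose)
}
} // namespace user_vf_sketch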
if (!UserVF.isScalable()) { LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF << " is unsafe, clamping to max safe VF=" << MaxSafeFixedVF << ".\n"); ORE->emit([&]() { return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", TheLoop->getStartLoc(), TheLoop->getHeader()) << "User-specified vectorization factor " << ore::NV("UserVectorizationFactor", UserVF) << " is unsafe, clamping to maximum safe vectorization factor " << ore::NV("VectorizationFactor", MaxSafeFixedVF); }); return MaxSafeFixedVF; } if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF << " is ignored because scalable vectors are not " "available.\n"); ORE->emit([&]() { return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", TheLoop->getStartLoc(), TheLoop->getHeader()) << "User-specified vectorization factor " << ore::NV("UserVectorizationFactor", UserVF) << " is ignored because the target does not support scalable " "vectors. The compiler will pick a more suitable value."; }); } else { LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF << " is unsafe. Ignoring scalable UserVF.\n"); ORE->emit([&]() { return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", TheLoop->getStartLoc(), TheLoop->getHeader()) << "User-specified vectorization factor " << ore::NV("UserVectorizationFactor", UserVF) << " is unsafe. Ignoring the hint to let the compiler pick a " "more suitable value."; }); } } LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " << WidestType << " bits.\n"); FixedScalableVFPair Result(ElementCount::getFixed(1), ElementCount::getScalable(0)); if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, MaxSafeFixedVF, FoldTailByMasking)) Result.FixedVF = MaxVF; if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, MaxSafeScalableVF, FoldTailByMasking)) if (MaxVF.isScalable()) { Result.ScalableVF = MaxVF; LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF << "\n"); } return Result; } FixedScalableVFPair LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may by useful to do since it's still likely to be dynamically // uniform if the target can skip. reportVectorizationFailure( "Not inserting runtime ptr check for divergent target", "runtime pointer checks needed. 
Not enabled for divergent target", "CantVersionLoopWithDivergentTarget", ORE, TheLoop); return FixedScalableVFPair::getNone(); } unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); if (TC == 1) { reportVectorizationFailure("Single iteration (non) loop", "loop trip count is one, irrelevant for vectorization", "SingleIterationLoop", ORE, TheLoop); return FixedScalableVFPair::getNone(); } switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: return computeFeasibleMaxVF(TC, UserVF, false); case CM_ScalarEpilogueNotAllowedUsePredicate: [[fallthrough]]; case CM_ScalarEpilogueNotNeededUsePredicate: LLVM_DEBUG( dbgs() << "LV: vector predicate hint/switch found.\n" << "LV: Not allowing scalar epilogue, creating predicated " << "vector loop.\n"); break; case CM_ScalarEpilogueNotAllowedLowTripLoop: // fallthrough as a special case of OptForSize case CM_ScalarEpilogueNotAllowedOptSize: if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) LLVM_DEBUG( dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); else LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " << "count.\n"); // Bail if runtime checks are required, which are not good when optimising // for size. if (runtimeChecksRequired()) return FixedScalableVFPair::getNone(); break; } // The only loops we can vectorize without a scalar epilogue, are loops with // a bottom-test and a single exiting block. We'd have to handle the fact // that not every instruction executes on the last iteration. This will // require a lane mask which varies through the vector loop body. (TODO) if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { // If there was a tail-folding hint/switch, but we can't fold the tail by // masking, fallback to a vectorization with a scalar epilogue. if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; return computeFeasibleMaxVF(TC, UserVF, false); } return FixedScalableVFPair::getNone(); } // Now try the tail folding // Invalidate interleave groups that require an epilogue if we can't mask // the interleave-group. if (!useMaskedInterleavedAccesses(TTI)) { assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && "No decisions should have been taken at this point"); // Note: There is no need to invalidate any cost modeling decisions here, as // non where taken so far. InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); // Avoid tail folding if the trip count is known to be a multiple of any VF // we choose. std::optional MaxPowerOf2RuntimeVF = MaxFactors.FixedVF.getFixedValue(); if (MaxFactors.ScalableVF) { std::optional MaxVScale = getMaxVScale(*TheFunction, TTI); if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) { MaxPowerOf2RuntimeVF = std::max( *MaxPowerOf2RuntimeVF, *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue()); } else MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now. } if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) { assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && "MaxFixedVF must be a power of 2"); unsigned MaxVFtimesIC = UserIC ? 
*MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF; ScalarEvolution *SE = PSE.getSE(); const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); const SCEV *ExitCount = SE->getAddExpr( BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); const SCEV *Rem = SE->getURemExpr( SE->applyLoopGuards(ExitCount, TheLoop), SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); if (Rem->isZero()) { // Accept MaxFixedVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); return MaxFactors; } } // If we don't know the precise trip count, or if the trip count that we // found modulo the vectorization factor is not zero, try to fold the tail // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { CanFoldTailByMasking = true; return MaxFactors; } // If there was a tail-folding hint/switch, but we can't fold the tail by // masking, fallback to a vectorization with a scalar epilogue. if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; return MaxFactors; } if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); return FixedScalableVFPair::getNone(); } if (TC == 0) { reportVectorizationFailure( "Unable to calculate the loop count due to complex control flow", "unable to calculate the loop count due to complex control flow", "UnknownLoopCountComplexCFG", ORE, TheLoop); return FixedScalableVFPair::getNone(); } reportVectorizationFailure( "Cannot optimize for size and vectorize at the same time.", "cannot optimize for size and vectorize at the same time. " "Enable vectorization of this loop with '#pragma clang loop " "vectorize(enable)' when compiling with -Os/-Oz", "NoTailLoopWithOptForSize", ORE, TheLoop); return FixedScalableVFPair::getNone(); } ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, ElementCount MaxSafeVF, bool FoldTailByMasking) { bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); const TypeSize WidestRegister = TTI.getRegisterBitWidth( ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector : TargetTransformInfo::RGK_FixedWidthVector); // Convenience function to return the minimum of two ElementCounts. auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { assert((LHS.isScalable() == RHS.isScalable()) && "Scalable flags must match"); return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; }; // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. auto MaxVectorElementCount = ElementCount::get( llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType), ComputeScalableMaxVF); MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " << (MaxVectorElementCount * WidestType) << " bits.\n"); if (!MaxVectorElementCount) { LLVM_DEBUG(dbgs() << "LV: The target has no " << (ComputeScalableMaxVF ? 
"scalable" : "fixed") << " vector registers.\n"); return ElementCount::getFixed(1); } unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); if (MaxVectorElementCount.isScalable() && TheFunction->hasFnAttribute(Attribute::VScaleRange)) { auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); auto Min = Attr.getVScaleRangeMin(); WidestRegisterMinEC *= Min; } // When a scalar epilogue is required, at least one iteration of the scalar // loop has to execute. Adjust ConstTripCount accordingly to avoid picking a // max VF that results in a dead vector loop. if (ConstTripCount > 0 && requiresScalarEpilogue(true)) ConstTripCount -= 1; if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC && (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { // If loop trip count (TC) is known at compile time there is no point in // choosing VF greater than TC (as done in the loop below). Select maximum // power of two which doesn't exceed TC. // If MaxVectorElementCount is scalable, we only fall back on a fixed VF // when the TC is less than or equal to the known number of lanes. auto ClampedConstTripCount = llvm::bit_floor(ConstTripCount); LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " "exceeding the constant trip count: " << ClampedConstTripCount << "\n"); return ElementCount::getFixed(ClampedConstTripCount); } TargetTransformInfo::RegisterKind RegKind = ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector : TargetTransformInfo::RGK_FixedWidthVector; ElementCount MaxVF = MaxVectorElementCount; if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && TTI.shouldMaximizeVectorBandwidth(RegKind))) { auto MaxVectorElementCountMaxBW = ElementCount::get( llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), ComputeScalableMaxVF); MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorElementCount). SmallVector VFs; for (ElementCount VS = MaxVectorElementCount * 2; ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) VFs.push_back(VS); // For each VF calculate its register usage. auto RUs = calculateRegisterUsage(VFs); // Select the largest VF which doesn't require more registers than existing // ones. for (int i = RUs.size() - 1; i >= 0; --i) { bool Selected = true; for (auto &pair : RUs[i].MaxLocalUsers) { unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); if (pair.second > TargetNumRegisters) Selected = false; } if (Selected) { MaxVF = VFs[i]; break; } } if (ElementCount MinVF = TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { if (ElementCount::isKnownLT(MaxVF, MinVF)) { LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF << ") with target's minimum: " << MinVF << '\n'); MaxVF = MinVF; } } // Invalidate any widening decisions we might have made, in case the loop // requires prediction (decided later), but we have already made some // load/store widening decisions. invalidateCostModelingDecisions(); } return MaxVF; } /// Convenience function that returns the value of vscale_range iff /// vscale_range.min == vscale_range.max or otherwise returns the value /// returned by the corresponding TTI method. 
static std::optional getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { const Function *Fn = L->getHeader()->getParent(); if (Fn->hasFnAttribute(Attribute::VScaleRange)) { auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); auto Min = Attr.getVScaleRangeMin(); auto Max = Attr.getVScaleRangeMax(); if (Max && Min == Max) return Max; } return TTI.getVScaleForTuning(); } bool LoopVectorizationPlanner::isMoreProfitable( const VectorizationFactor &A, const VectorizationFactor &B) const { InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { // If the trip count is a known (possibly small) constant, the trip count // will be rounded up to an integer number of iterations under // FoldTailByMasking. The total cost in that case will be // VecCost*ceil(TripCount/VF). When not folding the tail, the total // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be // some extra overheads, but for the purpose of comparing the costs of // different VFs we can use this to compare the total loop-body cost // expected after vectorization. auto GetCostForTC = [MaxTripCount, this](unsigned VF, InstructionCost VectorCost, InstructionCost ScalarCost) { return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) : VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF); }; auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost); auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost); return RTCostA < RTCostB; } // Improve estimate for the vector width if it is scalable. unsigned EstimatedWidthA = A.Width.getKnownMinValue(); unsigned EstimatedWidthB = B.Width.getKnownMinValue(); if (std::optional VScale = getVScaleForTuning(OrigLoop, TTI)) { if (A.Width.isScalable()) EstimatedWidthA *= *VScale; if (B.Width.isScalable()) EstimatedWidthB *= *VScale; } // Assume vscale may be larger than 1 (or the value being tuned for), // so that scalable vectorization is slightly favorable over fixed-width // vectorization. if (A.Width.isScalable() && !B.Width.isScalable()) return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); // To avoid the need for FP division: // (CostA / A.Width) < (CostB / B.Width) // <=> (CostA * B.Width) < (CostB * A.Width) return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); } static void emitInvalidCostRemarks(SmallVector InvalidCosts, OptimizationRemarkEmitter *ORE, Loop *TheLoop) { if (InvalidCosts.empty()) return; // Emit a report of VFs with invalid costs in the loop. // Group the remarks per instruction, keeping the instruction order from // InvalidCosts. std::map Numbering; unsigned I = 0; for (auto &Pair : InvalidCosts) if (!Numbering.count(Pair.first)) Numbering[Pair.first] = I++; // Sort the list, first on instruction(number) then on VF. 
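// Illustrative sketch (not part of this patch), relating to isMoreProfitable
// above: comparing per-lane costs CostA/WidthA < CostB/WidthB is done with a
// cross-multiplication so no floating-point or truncating division is needed.
// Stand-alone version with invented names, assuming the products fit in 64
// bits for realistic cost and width values.
#include <cstdint>
namespace vf_compare_sketch {
inline bool isCheaperPerLane(uint64_t CostA, uint64_t WidthA, uint64_t CostB,
                             uint64_t WidthB) {
  // CostA / WidthA < CostB / WidthB  <=>  CostA * WidthB < CostB * WidthA
  return CostA * WidthB < CostB * WidthA;
}
} // namespace vf_compare_sketch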
sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { if (Numbering[A.first] != Numbering[B.first]) return Numbering[A.first] < Numbering[B.first]; ElementCountComparator ECC; return ECC(A.second, B.second); }); // For a list of ordered instruction-vf pairs: // [(load, vf1), (load, vf2), (store, vf1)] // Group the instructions together to emit separate remarks for: // load (vf1, vf2) // store (vf1) auto Tail = ArrayRef(InvalidCosts); auto Subset = ArrayRef(); do { if (Subset.empty()) Subset = Tail.take_front(1); Instruction *I = Subset.front().first; // If the next instruction is different, or if there are no other pairs, // emit a remark for the collated subset. e.g. // [(load, vf1), (load, vf2))] // to emit: // remark: invalid costs for 'load' at VF=(vf, vf2) if (Subset == Tail || Tail[Subset.size()].first != I) { std::string OutString; raw_string_ostream OS(OutString); assert(!Subset.empty() && "Unexpected empty range"); OS << "Instruction with invalid costs prevented vectorization at VF=("; for (const auto &Pair : Subset) OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second; OS << "):"; if (auto *CI = dyn_cast(I)) OS << " call to " << CI->getCalledFunction()->getName(); else OS << " " << I->getOpcodeName(); OS.flush(); reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); Tail = Tail.drop_front(Subset.size()); Subset = {}; } else // Grow the subset by one element Subset = Tail.take_front(Subset.size() + 1); } while (!Tail.empty()); } VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor( const ElementCountSet &VFCandidates) { InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1)).first; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); assert(VFCandidates.count(ElementCount::getFixed(1)) && "Expected Scalar VF to be a candidate"); const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (ForceVectorization && VFCandidates.size() > 1) { // Ignore scalar width, because the user explicitly wants vectorization. // Initialize cost to max so that VF = 2 is, at least, chosen during cost // evaluation. ChosenFactor.Cost = InstructionCost::getMax(); } SmallVector InvalidCosts; for (const auto &i : VFCandidates) { // The cost for scalar VF=1 is already calculated, so ignore it. if (i.isScalar()) continue; LoopVectorizationCostModel::VectorizationCostTy C = CM.expectedCost(i, &InvalidCosts); VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); #ifndef NDEBUG unsigned AssumedMinimumVscale = 1; if (std::optional VScale = getVScaleForTuning(OrigLoop, TTI)) AssumedMinimumVscale = *VScale; unsigned Width = Candidate.Width.isScalable() ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale : Candidate.Width.getFixedValue(); LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (Candidate.Cost / Width)); if (i.isScalable()) LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " << AssumedMinimumVscale << ")"); LLVM_DEBUG(dbgs() << ".\n"); #endif if (!C.second && !ForceVectorization) { LLVM_DEBUG( dbgs() << "LV: Not considering vector loop of width " << i << " because it will not generate any vector instructions.\n"); continue; } // If profitable add it to ProfitableVF list. 
if (isMoreProfitable(Candidate, ScalarCost)) ProfitableVFs.push_back(Candidate); if (isMoreProfitable(Candidate, ChosenFactor)) ChosenFactor = Candidate; } emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop); if (!EnableCondStoresVectorization && CM.hasPredStores()) { reportVectorizationFailure( "There are conditional stores.", "store that is conditionally executed prevents vectorization", "ConditionalStore", ORE, OrigLoop); ChosenFactor = ScalarCost; } LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); return ChosenFactor; } bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( ElementCount VF) const { // Cross iteration phis such as reductions need special handling and are // currently unsupported. if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) return false; // Phis with uses outside of the loop require special handling and are // currently unsupported. for (const auto &Entry : Legal->getInductionVars()) { // Look for uses of the value of the induction at the last iteration. Value *PostInc = Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch()); for (User *U : PostInc->users()) if (!OrigLoop->contains(cast(U))) return false; // Look for uses of penultimate value of the induction. for (User *U : Entry.first->users()) if (!OrigLoop->contains(cast(U))) return false; } // Epilogue vectorization code has not been auditted to ensure it handles // non-latch exits properly. It may be fine, but it needs auditted and // tested. if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) return false; return true; } bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( const ElementCount VF) const { // FIXME: We need a much better cost-model to take different parameters such // as register pressure, code size increase and cost of extra branches into // account. For now we apply a very crude heuristic and only consider loops // with vectorization factors larger than a certain value. // Allow the target to opt out entirely. if (!TTI.preferEpilogueVectorization()) return false; // We also consider epilogue vectorization unprofitable for targets that don't // consider interleaving beneficial (eg. MVE). if (TTI.getMaxInterleaveFactor(VF) <= 1) return false; unsigned Multiplier = 1; if (VF.isScalable()) Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1); if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) return true; return false; } VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( const ElementCount MainLoopVF, unsigned IC) { VectorizationFactor Result = VectorizationFactor::Disabled(); if (!EnableEpilogueVectorization) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); return Result; } if (!CM.isScalarEpilogueAllowed()) { LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " "epilogue is allowed.\n"); return Result; } // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. 
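// Illustrative sketch (not part of this patch), mirroring the profitability
// gate in isEpilogueVectorizationProfitable above: a scalable main-loop VF is
// scaled by the vscale used for tuning before being compared against the
// minimum VF at which epilogue vectorization is assumed to pay off. Names and
// the example threshold are invented for illustration.
namespace epilogue_profit_sketch {
inline bool epilogueLikelyProfitable(unsigned KnownMinVF, bool IsScalable,
                                     unsigned VScaleForTuning,
                                     unsigned MinProfitableVF /*e.g. 16*/) {
  unsigned Multiplier =
      IsScalable ? (VScaleForTuning ? VScaleForTuning : 1) : 1;
  return Multiplier * KnownMinVF >= MinProfitableVF;
}
} // namespace epilogue_profit_sketch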
if (!isCandidateForEpilogueVectorization(MainLoopVF)) { LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " "is not a supported candidate.\n"); return Result; } if (EpilogueVectorizationForceVF > 1) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); if (hasPlanWithVF(ForcedEC)) return {ForcedEC, 0, 0}; else { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " "viable.\n"); return Result; } } if (OrigLoop->getHeader()->getParent()->hasOptSize() || OrigLoop->getHeader()->getParent()->hasMinSize()) { LLVM_DEBUG( dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); return Result; } if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " "this loop\n"); return Result; } // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know // the main loop handles 8 lanes per iteration. We could still benefit from // vectorizing the epilogue loop with VF=4. ElementCount EstimatedRuntimeVF = MainLoopVF; if (MainLoopVF.isScalable()) { EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); if (std::optional VScale = getVScaleForTuning(OrigLoop, TTI)) EstimatedRuntimeVF *= *VScale; } ScalarEvolution &SE = *PSE.getSE(); Type *TCType = Legal->getWidestInductionType(); const SCEV *RemainingIterations = nullptr; for (auto &NextVF : ProfitableVFs) { // Skip candidate VFs without a corresponding VPlan. if (!hasPlanWithVF(NextVF.Width)) continue; // Skip candidate VFs with widths >= the estimate runtime VF (scalable // vectors) or the VF of the main loop (fixed vectors). if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) continue; // If NextVF is greater than the number of remaining iterations, the // epilogue loop would be dead. Skip such factors. if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { // TODO: extend to support scalable VFs. if (!RemainingIterations) { const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop); RemainingIterations = SE.getURemExpr( TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); } if (SE.isKnownPredicate( CmpInst::ICMP_UGT, SE.getConstant(TCType, NextVF.Width.getKnownMinValue()), RemainingIterations)) continue; } if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) Result = NextVF; } if (Result != VectorizationFactor::Disabled()) LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " << Result.Width << "\n"); return Result; } std::pair LoopVectorizationCostModel::getSmallestAndWidestTypes() { unsigned MinWidth = -1U; unsigned MaxWidth = 8; const DataLayout &DL = TheFunction->getParent()->getDataLayout(); // For in-loop reductions, no element types are added to ElementTypesInLoop // if there are no loads/stores in the loop. In this case, check through the // reduction variables to determine the maximum width. if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { // Reset MaxWidth so that we can find the smallest type used by recurrences // in the loop. MaxWidth = -1U; for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; // When finding the min width used by the recurrence we need to account // for casts on the input operands of the recurrence. 
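// Illustrative sketch (not part of this patch), relating to the epilogue-VF
// filtering above: once the main loop runs with factor MainVF * IC, only
// TC % (MainVF * IC) iterations remain, so any epilogue VF wider than that
// remainder would produce a dead epilogue loop. Toy version with invented
// names, assuming a compile-time-known trip count and MainVF * IC != 0.
namespace epilogue_remainder_sketch {
inline bool epilogueWouldBeDead(unsigned TripCount, unsigned MainVF,
                                unsigned IC, unsigned EpilogueVF) {
  unsigned Remaining = TripCount % (MainVF * IC);
  return EpilogueVF > Remaining; // no full epilogue iteration would execute
}
} // namespace epilogue_remainder_sketch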
MaxWidth = std::min( MaxWidth, std::min( RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), RdxDesc.getRecurrenceType()->getScalarSizeInBits())); } } else { for (Type *T : ElementTypesInLoop) { MinWidth = std::min( MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); MaxWidth = std::max( MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); } } return {MinWidth, MaxWidth}; } void LoopVectorizationCostModel::collectElementTypesForWidening() { ElementTypesInLoop.clear(); // For each block. for (BasicBlock *BB : TheLoop->blocks()) { // For each instruction in the loop. for (Instruction &I : BB->instructionsWithoutDebug()) { Type *T = I.getType(); // Skip ignored values. if (ValuesToIgnore.count(&I)) continue; // Only examine Loads, Stores and PHINodes. if (!isa(I) && !isa(I) && !isa(I)) continue; // Examine PHI nodes that are reduction variables. Update the type to // account for the recurrence type. if (auto *PN = dyn_cast(&I)) { if (!Legal->isReductionVariable(PN)) continue; const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars().find(PN)->second; if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || TTI.preferInLoopReduction(RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), TargetTransformInfo::ReductionFlags())) continue; T = RdxDesc.getRecurrenceType(); } // Examine the stored values. if (auto *ST = dyn_cast(&I)) T = ST->getValueOperand()->getType(); assert(T->isSized() && "Expected the load/store/recurrence type to be sized"); ElementTypesInLoop.insert(T); } } } unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, InstructionCost LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. // There are many micro-architectural considerations that we can't predict // at this level. For example, frontend pressure (on decode or fetch) due to // code size, or the number and capabilities of the execution ports. // // We use the following heuristics to select the interleave count: // 1. If the code has reductions, then we interleave to break the cross // iteration dependency. // 2. If the loop is really small, then we interleave to reduce the loop // overhead. // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. if (!isScalarEpilogueAllowed()) return 1; // We used the distance for the interleave count. if (!Legal->isSafeForAnyVectorWidth()) return 1; auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); const bool HasReductions = !Legal->getReductionVars().empty(); // Do not interleave loops with a relatively small known or estimated trip // count. But we will interleave when InterleaveSmallLoopScalarReduction is // enabled, and the code has scalar reductions(HasReductions && VF = 1), // because with the above conditions interleaving can expose ILP and break // cross iteration dependences for reductions. if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) return 1; // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. if (LoopCost == 0) { LoopCost = expectedCost(VF).first; assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); // Loop body is free and there is no need for interleaving. 
if (LoopCost == 0) return 1; } RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. for (auto& pair : R.MaxLocalUsers) { pair.second = std::max(pair.second, 1U); } // We calculate the interleave count using the following formula. // Subtract the number of loop invariants from the number of available // registers. These registers are used by all of the interleaved instances. // Next, divide the remaining registers by the number of registers that is // required by the loop, in order to estimate how many parallel instances // fit without causing spills. All of this is rounded down if necessary to be // a power of two. We want power of two interleave count to simplify any // addressing operations or alignment considerations. // We also want power of two interleave counts to ensure that the induction // variable of the vector loop wraps to zero, when tail is folded by masking; // this currently happens when OptForSize, in which case IC is set to 1 above. unsigned IC = UINT_MAX; for (auto& pair : R.MaxLocalUsers) { unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters << " registers of " << TTI.getRegisterClassName(pair.first) << " register class\n"); if (VF.isScalar()) { if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) TargetNumRegisters = ForceTargetNumScalarRegs; } else { if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) TargetNumRegisters = ForceTargetNumVectorRegs; } unsigned MaxLocalUsers = pair.second; unsigned LoopInvariantRegs = 0; if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); // Don't count the induction variable as interleaved. if (EnableIndVarRegisterHeur) { TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) / std::max(1U, (MaxLocalUsers - 1))); } IC = std::min(IC, TmpIC); } // Clamp the interleave ranges to reasonable counts. unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); // Check if the user has overridden the max. if (VF.isScalar()) { if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; } else { if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } // If trip count is known or estimated compile time constant, limit the // interleave count to be less than the trip count divided by VF, provided it // is at least 1. // // For scalable vectors we can't know if interleaving is beneficial. It may // not be beneficial for small loops if none of the lanes in the second vector // iterations is enabled. However, for larger loops, there is likely to be a // similar benefit as for fixed-width vectors. For now, we choose to leave // the InterleaveCount as if vscale is '1', although if some information about // the vector is known (e.g. min vector size), we can make a better decision. if (BestKnownTC) { MaxInterleaveCount = std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); // Make sure MaxInterleaveCount is greater than 0. 
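// Illustrative sketch (not part of this patch): the interleave count derived
// above is essentially "spare registers divided by registers needed per
// instance, rounded down to a power of two", which the surrounding code then
// clamps by the target's maximum interleave factor and the trip count. Toy
// model with invented names; the real code can additionally reserve a
// register for the induction variable.
namespace interleave_count_sketch {
inline unsigned floorPow2(unsigned X) {
  unsigned P = 1;
  while (P * 2 <= X)
    P *= 2;
  return P; // largest power of two <= X (1 for X <= 1)
}
inline unsigned registerBasedIC(unsigned TargetNumRegisters,
                                unsigned LoopInvariantRegs,
                                unsigned MaxLocalUsers) {
  if (MaxLocalUsers == 0 || TargetNumRegisters <= LoopInvariantRegs)
    return 1; // nothing to spare; do not interleave
  unsigned IC = (TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers;
  return IC ? floorPow2(IC) : 1; // power of two, at least 1
}
} // namespace interleave_count_sketch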
MaxInterleaveCount = std::max(1u, MaxInterleaveCount); } assert(MaxInterleaveCount > 0 && "Maximum interleave count must be greater than 0"); // Clamp the calculated IC to be between the 1 and the max interleave count // that the target and trip count allows. if (IC > MaxInterleaveCount) IC = MaxInterleaveCount; else // Make sure IC is greater than 0. IC = std::max(1u, IC); assert(IC > 0 && "Interleave count must be greater than 0."); // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. if (VF.isVector() && HasReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } // For any scalar loop that either requires runtime checks or predication we // are better off leaving this to the unroller. Note that if we've already // vectorized the loop we will have done the runtime check and so interleaving // won't require further checks. bool ScalarInterleavingRequiresPredication = (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { return Legal->blockNeedsPredication(BB); })); bool ScalarInterleavingRequiresRuntimePointerCheck = (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' << "LV: IC is " << IC << '\n' << "LV: VF is " << VF << '\n'); const bool AggressivelyInterleaveReductions = TTI.enableAggressiveInterleaving(HasReductions); if (!ScalarInterleavingRequiresRuntimePointerCheck && !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and interleave until the cost of the // loop overhead is about 5% of the cost of the loop. unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor( SmallLoopCost / *LoopCost.getValue())); // Interleave until store/load ports (estimated by max interleave count) are // saturated. unsigned NumStores = Legal->getNumStores(); unsigned NumLoads = Legal->getNumLoads(); unsigned StoresIC = IC / (NumStores ? NumStores : 1); unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); // There is little point in interleaving for reductions containing selects // and compares when VF=1 since it may just create more overhead than it's // worth for loops with small trip counts. This is because we still have to // do the final reduction after the loop. bool HasSelectCmpReductions = HasReductions && any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { const RecurrenceDescriptor &RdxDesc = Reduction.second; return RecurrenceDescriptor::isSelectCmpRecurrenceKind( RdxDesc.getRecurrenceKind()); }); if (HasSelectCmpReductions) { LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); return 1; } // If we have a scalar reduction (vector reductions are already dealt with // by this point), we can increase the critical path length if the loop // we're interleaving is inside another loop. For tree-wise reductions // set the limit to 2, and for ordered reductions it's best to disable // interleaving entirely. 
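// Illustrative sketch (not part of this patch), summarising the policy spelled
// out in the comment above: for scalar reductions inside an inner loop the
// interleave count is capped (ordered reductions get no interleaving at all,
// tree-wise ones are limited to a small constant). Invented names; the cap
// parameter mirrors the MaxNestedScalarReductionIC-style limit described
// above.
namespace nested_reduction_ic_sketch {
inline unsigned capNestedReductionIC(unsigned IC, bool HasOrderedReduction,
                                     unsigned NestedCap /*e.g. 2*/) {
  if (HasOrderedReduction)
    return 1; // strict-FP reductions stay serialized, so extra copies add no ILP
  return IC < NestedCap ? IC : NestedCap;
}
} // namespace nested_reduction_ic_sketch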
if (HasReductions && TheLoop->getLoopDepth() > 1) { bool HasOrderedReductions = any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { const RecurrenceDescriptor &RdxDesc = Reduction.second; return RdxDesc.isOrdered(); }); if (HasOrderedReductions) { LLVM_DEBUG( dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); return 1; } unsigned F = static_cast(MaxNestedScalarReductionIC); SmallIC = std::min(SmallIC, F); StoresIC = std::min(StoresIC, F); LoadsIC = std::min(LoadsIC, F); } if (EnableLoadStoreRuntimeInterleave && std::max(StoresIC, LoadsIC) > SmallIC) { LLVM_DEBUG( dbgs() << "LV: Interleaving to saturate store or load ports.\n"); return std::max(StoresIC, LoadsIC); } // If there are scalar reductions and TTI has enabled aggressive // interleaving for reductions, we will interleave to expose ILP. if (InterleaveSmallLoopScalarReduction && VF.isScalar() && AggressivelyInterleaveReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); // Interleave no less than SmallIC but not as aggressive as the normal IC // to satisfy the rare situation when resources are too limited. return std::max(IC / 2, SmallIC); } else { LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); return SmallIC; } } // Interleave if this is a large loop (small loops are already dealt with by // this point) that could benefit from interleaving. if (AggressivelyInterleaveReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); return IC; } LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); return 1; } SmallVector LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and // assign a number to each instruction. We use RPO to ensure that defs are // met before their users. We assume that each instruction that has in-loop // users starts an interval. We record every time that an in-loop value is // used, so we have a list of the first and last occurrences of each // instruction. Next, we transpose this data structure into a multi map that // holds the list of intervals that *end* at a specific location. This multi // map allows us to perform a linear search. We scan the instructions linearly // and record each time that a new interval starts, by placing it in a set. // If we find this value in the multi-map then we remove it from the set. // The max register usage is the maximum size of the set. // We also search for instructions that are defined outside the loop, but are // used inside the loop. We need this number separately from the max-interval // usage number because when we unroll, loop-invariant values do not take // more register. LoopBlocksDFS DFS(TheLoop); DFS.perform(LI); RegisterUsage RU; // Each 'key' in the map opens a new interval. The values // of the map are the index of the 'last seen' usage of the // instruction that is the key. using IntervalMap = DenseMap; // Maps instruction to its index. SmallVector IdxToInstr; // Marks the end of each interval. IntervalMap EndPoint; // Saves the list of instruction indices that are used in the loop. SmallPtrSet Ends; // Saves the list of values that are used in the loop but are defined outside // the loop (not including non-instruction values such as arguments and // constants). 
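// Illustrative sketch (not part of this patch) of the interval bookkeeping
// described in the comment above: instructions are numbered in traversal
// order, each value's interval ends at its last use, and the register
// estimate is the maximum number of simultaneously open intervals. This toy
// version brute-forces the scan over (def, lastUse) pairs for clarity; the
// real code does it in one linear pass using a transposed end-point map.
// All names are invented.
#include <vector>
namespace max_live_sketch {
struct Interval { unsigned Def, LastUse; };
inline unsigned maxOpenIntervals(const std::vector<Interval> &Intervals,
                                 unsigned NumInstructions) {
  unsigned Best = 0;
  for (unsigned Idx = 0; Idx < NumInstructions; ++Idx) {
    unsigned Open = 0;
    for (const Interval &I : Intervals)
      if (I.Def <= Idx && Idx < I.LastUse) // value still live at Idx
        ++Open;
    Best = Best > Open ? Best : Open;
  }
  return Best;
}
} // namespace max_live_sketch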
SmallSetVector LoopInvariants; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { for (Instruction &I : BB->instructionsWithoutDebug()) { IdxToInstr.push_back(&I); // Save the end location of each USE. for (Value *U : I.operands()) { auto *Instr = dyn_cast(U); // Ignore non-instruction values such as arguments, constants, etc. // FIXME: Might need some motivation why these values are ignored. If // for example an argument is used inside the loop it will increase the // register pressure (so shouldn't we add it to LoopInvariants). if (!Instr) continue; // If this instruction is outside the loop then record it and continue. if (!TheLoop->contains(Instr)) { LoopInvariants.insert(Instr); continue; } // Overwrite previous end points. EndPoint[Instr] = IdxToInstr.size(); Ends.insert(Instr); } } } // Saves the list of intervals that end with the index in 'key'. using InstrList = SmallVector; DenseMap TransposeEnds; // Transpose the EndPoints to a list of values that end at each index. for (auto &Interval : EndPoint) TransposeEnds[Interval.second].push_back(Interval.first); SmallPtrSet OpenIntervals; SmallVector RUs(VFs.size()); SmallVector, 8> MaxUsages(VFs.size()); LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); const auto &TTICapture = TTI; auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) return 0; return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { Instruction *I = IdxToInstr[i]; // Remove all of the instructions that end at this location. InstrList &List = TransposeEnds[i]; for (Instruction *ToRemove : List) OpenIntervals.erase(ToRemove); // Ignore instructions that are never used within the loop. if (!Ends.count(I)) continue; // Skip ignored values. if (ValuesToIgnore.count(I)) continue; // For each VF find the maximum usage of registers. for (unsigned j = 0, e = VFs.size(); j < e; ++j) { // Count the number of registers used, per register class, given all open // intervals. // Note that elements in this SmallMapVector will be default constructed // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if // there is no previous entry for ClassID. SmallMapVector RegUsage; if (VFs[j].isScalar()) { for (auto *Inst : OpenIntervals) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); // FIXME: The target might use more than one register for the type // even in the scalar case. RegUsage[ClassID] += 1; } } else { collectUniformsAndScalars(VFs[j]); for (auto *Inst : OpenIntervals) { // Skip ignored values for VF > 1. if (VecValuesToIgnore.count(Inst)) continue; if (isScalarAfterVectorization(Inst, VFs[j])) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); // FIXME: The target might use more than one register for the type // even in the scalar case. RegUsage[ClassID] += 1; } else { unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); } } } for (auto& pair : RegUsage) { auto &Entry = MaxUsages[j][pair.first]; Entry = std::max(Entry, pair.second); } } LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << OpenIntervals.size() << '\n'); // Add the current instruction to the list of open intervals. OpenIntervals.insert(I); } for (unsigned i = 0, e = VFs.size(); i < e; ++i) { // Note that elements in this SmallMapVector will be default constructed // as 0. 
So we can use "Invariant[ClassID] += n" in the code below even if // there is no previous entry for ClassID. SmallMapVector Invariant; for (auto *Inst : LoopInvariants) { // FIXME: The target might use more than one register for the type // even in the scalar case. bool IsScalar = all_of(Inst->users(), [&](User *U) { auto *I = cast(U); return TheLoop != LI->getLoopFor(I->getParent()) || isScalarAfterVectorization(I, VFs[i]); }); ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i]; unsigned ClassID = TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); } LLVM_DEBUG({ dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() << " item\n"; for (const auto &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: " << TTI.getRegisterClassName(pair.first) << ", " << pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() << " item\n"; for (const auto &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: " << TTI.getRegisterClassName(pair.first) << ", " << pair.second << " registers\n"; } }); RU.LoopInvariantRegs = Invariant; RU.MaxLocalUsers = MaxUsages[i]; RUs[i] = RU; } return RUs; } bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF) { // TODO: Cost model for emulated masked load/store is completely // broken. This hack guides the cost model to use an artificially // high enough value to practically disable vectorization with such // operations, except where previously deployed legality hack allowed // using very low cost values. This is to avoid regressions coming simply // from moving "masked load/store" check from legality to cost model. // Masked Load/Gather emulation was previously never allowed. // Limited number of Masked Store/Scatter emulation was allowed. assert((isPredicatedInst(I)) && "Expecting a scalar emulated instruction"); return isa(I) || (isa(I) && NumPredStores > NumberOfStoresToPredicate); } void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { // If we aren't vectorizing the loop, or if we've already collected the // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the // expected cost for interleaving. if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF)) return; // Initialize a mapping for VF in InstsToScalalarize. If we find that it's // not profitable to scalarize any instructions, the presence of VF in the // map will indicate that we've analyzed it already. ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; PredicatedBBsAfterVectorization[VF].clear(); // Find all the instructions that are scalar with predication in the loop and // determine if it would be better to not if-convert the blocks they are in. // If so, we also record the instructions to scalarize. for (BasicBlock *BB : TheLoop->blocks()) { if (!blockNeedsPredicationForAnyReason(BB)) continue; for (Instruction &I : *BB) if (isScalarWithPredication(&I, VF)) { ScalarCostsTy ScalarCosts; // Do not apply discount if scalable, because that would lead to // invalid scalarization costs. // Do not apply discount logic if hacked cost is needed // for emulated masked memrefs. 
if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && computePredInstDiscount(&I, ScalarCosts, VF) >= 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); // Remember that BB will remain after vectorization. PredicatedBBsAfterVectorization[VF].insert(BB); } } } InstructionCost LoopVectorizationCostModel::computePredInstDiscount( Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { assert(!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"); // Initialize the discount to zero, meaning that the scalar version and the // vector version cost the same. InstructionCost Discount = 0; // Holds instructions to analyze. The instructions we visit are mapped in // ScalarCosts. Those instructions are the ones that would be scalarized if // we find that the scalar version costs less. SmallVector Worklist; // Returns true if the given instruction can be scalarized. auto canBeScalarized = [&](Instruction *I) -> bool { // We only attempt to scalarize instructions forming a single-use chain // from the original predicated block that would otherwise be vectorized. // Although not strictly necessary, we give up on instructions we know will // already be scalar to avoid traversing chains that are unlikely to be // beneficial. if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || isScalarAfterVectorization(I, VF)) return false; // If the instruction is scalar with predication, it will be analyzed // separately. We ignore it within the context of PredInst. if (isScalarWithPredication(I, VF)) return false; // If any of the instruction's operands are uniform after vectorization, // the instruction cannot be scalarized. This prevents, for example, a // masked load from being scalarized. // // We assume we will only emit a value for lane zero of an instruction // marked uniform after vectorization, rather than VF identical values. // Thus, if we scalarize an instruction that uses a uniform, we would // create uses of values corresponding to the lanes we aren't emitting code // for. This behavior can be changed by allowing getScalarValue to clone // the lane zero values for uniforms rather than asserting. for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) if (isUniformAfterVectorization(J, VF)) return false; // Otherwise, we can scalarize the instruction. return true; }; // Compute the expected cost discount from scalarizing the entire expression // feeding the predicated instruction. We currently only consider expressions // that are single-use instruction chains. Worklist.push_back(PredInst); while (!Worklist.empty()) { Instruction *I = Worklist.pop_back_val(); // If we've already analyzed the instruction, there's nothing to do. if (ScalarCosts.contains(I)) continue; // Compute the cost of the vector instruction. Note that this cost already // includes the scalarization overhead of the predicated instruction. InstructionCost VectorCost = getInstructionCost(I, VF).first; // Compute the cost of the scalarized instruction. This cost is the cost of // the instruction as if it wasn't if-converted and instead remained in the // predicated block. We will scale this cost by block probability after // computing the scalarization overhead. InstructionCost ScalarCost = VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1)).first; // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. 
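// Illustrative sketch (not part of this patch) of the cost shape computed at
// this point: the scalarized form pays VF copies of the scalar instruction
// plus insertelement/phi overhead, and the total is scaled down by the
// probability that the predicated block actually executes. Invented names;
// a reciprocal block probability of 2 mirrors the 50% assumption used by
// getReciprocalPredBlockProb().
namespace pred_discount_sketch {
inline long scalarizedPredicatedCost(unsigned VF, long ScalarInstCost,
                                     long InsertAndPhiOverhead,
                                     unsigned ReciprocalBlockProb /*e.g. 2*/) {
  long Cost = static_cast<long>(VF) * ScalarInstCost + InsertAndPhiOverhead;
  return Cost / ReciprocalBlockProb; // the block only runs part of the time
}
} // namespace pred_discount_sketch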
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(I->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, /*Extract*/ false, CostKind); ScalarCost += VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); } // Compute the scalarization overhead of needed extractelement // instructions. For each of the instruction's operands, if the operand can // be scalarized, add it to the worklist; otherwise, account for the // overhead. for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) { assert(VectorType::isValidElementType(J->getType()) && "Instruction has non-scalar type"); if (canBeScalarized(J)) Worklist.push_back(J); else if (needsExtract(J, VF)) { ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(J->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, /*Extract*/ true, CostKind); } } // Scale the total scalar cost by block probability. ScalarCost /= getReciprocalPredBlockProb(); // Compute the discount. A non-negative discount means the vector version // of the instruction costs more, and scalarizing would be beneficial. Discount += VectorCost - ScalarCost; ScalarCosts[I] = ScalarCost; } return Discount; } LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::expectedCost( ElementCount VF, SmallVectorImpl *Invalid) { VectorizationCostTy Cost; // For each block. for (BasicBlock *BB : TheLoop->blocks()) { VectorizationCostTy BlockCost; // For each instruction in the old loop. for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. if (ValuesToIgnore.count(&I) || (VF.isVector() && VecValuesToIgnore.count(&I))) continue; VectorizationCostTy C = getInstructionCost(&I, VF); // Check if we should override the cost. if (C.first.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) C.first = InstructionCost(ForceTargetInstructionCost); // Keep a list of instructions with invalid costs. if (Invalid && !C.first.isValid()) Invalid->emplace_back(&I, VF); BlockCost.first += C.first; BlockCost.second |= C.second; LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first << " for VF " << VF << " For instruction: " << I << '\n'); } // If we are vectorizing a predicated block, it will have been // if-converted. This means that the block's instructions (aside from // stores and instructions that may divide by zero) will now be // unconditionally executed. For the scalar case, we may not always execute // the predicated block, if it is an if-else block. Thus, scale the block's // cost by the probability of executing it. blockNeedsPredication from // Legal is used so as to not include all blocks in tail folded loops. if (VF.isScalar() && Legal->blockNeedsPredication(BB)) BlockCost.first /= getReciprocalPredBlockProb(); Cost.first += BlockCost.first; Cost.second |= BlockCost.second; } return Cost; } /// Gets Address Access SCEV after verifying that the access pattern /// is loop invariant except the induction variable dependence. /// /// This SCEV can be sent to the Target in order to estimate the address /// calculation cost. static const SCEV *getAddressAccessSCEV( Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop) { auto *Gep = dyn_cast(Ptr); if (!Gep) return nullptr; // We are looking for a gep with all loop invariant indices except for one // which should be an induction variable. 
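// Illustrative sketch (not part of this patch) of the shape being matched
// here: an address computation whose indices are each either loop-invariant
// or an induction variable, i.e. a simple strided access the target can cost
// cheaply. Toy classifier over per-index flags with invented names.
#include <vector>
namespace strided_gep_sketch {
struct IndexInfo { bool IsLoopInvariant; bool IsInduction; };
inline bool looksStrided(const std::vector<IndexInfo> &Indices) {
  for (const IndexInfo &Idx : Indices)
    if (!Idx.IsLoopInvariant && !Idx.IsInduction)
      return false; // some index varies in an unanalyzable way
  return true;
}
} // namespace strided_gep_sketch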
  auto SE = PSE.getSE();
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return nullptr;
  }

  // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
  return PSE.getSCEV(Ptr);
}

InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                        ElementCount VF) {
  assert(VF.isVector() &&
         "Scalarization cost of instruction implies vectorization.");
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto SE = PSE.getSE();

  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
  // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
  //       that it is being called from this specific place.

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  InstructionCost Cost =
      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  const Align Alignment = getLoadStoreAlignment(I);
  Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
                                                      ValTy->getScalarType(),
                                                      Alignment, AS, CostKind);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF, CostKind);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    // Add the cost of an i1 extract and a branch
    auto *Vec_i1Ty =
        VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
    Cost += TTI.getScalarizationOverhead(
        Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
        /*Insert=*/false, /*Extract=*/true, CostKind);
    Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);

    if (useEmulatedMaskMemRefHack(I, VF))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
Cost = 3000000; } return Cost; } InstructionCost LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, ElementCount VF) { Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); unsigned AS = getLoadStoreAddressSpace(I); int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Stride should be 1 or -1 for consecutive memory access"); const Align Alignment = getLoadStoreAlignment(I); InstructionCost Cost = 0; if (Legal->isMaskRequired(I)) { Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, CostKind); } else { TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, CostKind, OpInfo, I); } bool Reverse = ConsecutiveStride < 0; if (Reverse) Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, std::nullopt, CostKind, 0); return Cost; } InstructionCost LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, ElementCount VF) { assert(Legal->isUniformMemOp(*I, VF)); Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; if (isa(I)) { return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, CostKind) + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); } StoreInst *SI = cast(I); bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind) + (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, CostKind, VF.getKnownMinValue() - 1)); } InstructionCost LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, ElementCount VF) { Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); const Value *Ptr = getLoadStorePointerOperand(I); return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost( I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, TargetTransformInfo::TCK_RecipThroughput, I); } InstructionCost LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, ElementCount VF) { Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; auto Group = getInterleavedAccessGroup(I); assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); // Holds the indices of existing members in the interleaved group. SmallVector Indices; for (unsigned IF = 0; IF < InterleaveFactor; IF++) if (Group->getMember(IF)) Indices.push_back(IF); // Calculate the cost of the whole interleaved group. 
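  // For example, assuming a factor-2 group {A[2*i], A[2*i+1]} costed at VF=4,
  // the group is modelled as one wide access of <8 x T> plus the shuffles
  // needed to (de)interleave its members; getInterleavedMemoryOpCost returns a
  // single combined number for all of that.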
bool UseMaskForGaps = (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || (isa(I) && (Group->getNumMembers() < Group->getFactor())); InstructionCost Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. assert(!Legal->isMaskRequired(I) && "Reverse masked interleaved access not supported."); Cost += Group->getNumMembers() * TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, std::nullopt, CostKind, 0); } return Cost; } std::optional LoopVectorizationCostModel::getReductionPatternCost( Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { using namespace llvm::PatternMatch; // Early exit for no inloop reductions if (InLoopReductionChains.empty() || VF.isScalar() || !isa(Ty)) return std::nullopt; auto *VectorTy = cast(Ty); // We are looking for a pattern of, and finding the minimal acceptable cost: // reduce(mul(ext(A), ext(B))) or // reduce(mul(A, B)) or // reduce(ext(A)) or // reduce(A). // The basic idea is that we walk down the tree to do that, finding the root // reduction instruction in InLoopReductionImmediateChains. From there we find // the pattern of mul/ext and test the cost of the entire pattern vs the cost // of the components. If the reduction cost is lower then we return it for the // reduction instruction and 0 for the other instructions in the pattern. If // it is not we return an invalid cost specifying the orignal cost method // should be used. Instruction *RetI = I; if (match(RetI, m_ZExtOrSExt(m_Value()))) { if (!RetI->hasOneUser()) return std::nullopt; RetI = RetI->user_back(); } if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && RetI->user_back()->getOpcode() == Instruction::Add) { RetI = RetI->user_back(); } // Test if the found instruction is a reduction, and if not return an invalid // cost specifying the parent to use the original cost modelling. if (!InLoopReductionImmediateChains.count(RetI)) return std::nullopt; // Find the reduction this chain is a part of and calculate the basic cost of // the reduction on its own. Instruction *LastChain = InLoopReductionImmediateChains[RetI]; Instruction *ReductionPhi = LastChain; while (!isa(ReductionPhi)) ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars().find(cast(ReductionPhi))->second; InstructionCost BaseCost = TTI.getArithmeticReductionCost( RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); // For a call to the llvm.fmuladd intrinsic we need to add the cost of a // normal fmul instruction to the cost of the fadd reduction. if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) BaseCost += TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); // If we're using ordered reductions then we can just return the base cost // here, since getArithmeticReductionCost calculates the full ordered // reduction cost when FP reassociation is not allowed. if (useOrderedReductions(RdxDesc)) return BaseCost; // Get the operand that was not the reduction chain and match it to one of the // patterns, returning the better cost if it is found. Instruction *RedOp = RetI->getOperand(1) == LastChain ? 
dyn_cast(RetI->getOperand(0)) : dyn_cast(RetI->getOperand(1)); VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); Instruction *Op0, *Op1; if (RedOp && RdxDesc.getOpcode() == Instruction::Add && match(RedOp, m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && match(Op0, m_ZExtOrSExt(m_Value())) && Op0->getOpcode() == Op1->getOpcode() && Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { // Matched reduce.add(ext(mul(ext(A), ext(B))) // Note that the extend opcodes need to all match, or if A==B they will have // been converted to zext(mul(sext(A), sext(A))) as it is known positive, // which is equally fine. bool IsUnsigned = isa(Op0); auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); auto *MulType = VectorType::get(Op0->getType(), VectorTy); InstructionCost ExtCost = TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, TTI::CastContextHint::None, CostKind, Op0); InstructionCost MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); InstructionCost Ext2Cost = TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, TTI::CastContextHint::None, CostKind, RedOp); InstructionCost RedCost = TTI.getMulAccReductionCost( IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) return I == RetI ? RedCost : 0; } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && !TheLoop->isLoopInvariant(RedOp)) { // Matched reduce(ext(A)) bool IsUnsigned = isa(RedOp); auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); InstructionCost RedCost = TTI.getExtendedReductionCost( RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, RdxDesc.getFastMathFlags(), CostKind); InstructionCost ExtCost = TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, TTI::CastContextHint::None, CostKind, RedOp); if (RedCost.isValid() && RedCost < BaseCost + ExtCost) return I == RetI ? RedCost : 0; } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { if (match(Op0, m_ZExtOrSExt(m_Value())) && Op0->getOpcode() == Op1->getOpcode() && !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { bool IsUnsigned = isa(Op0); Type *Op0Ty = Op0->getOperand(0)->getType(); Type *Op1Ty = Op1->getOperand(0)->getType(); Type *LargestOpTy = Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty : Op0Ty; auto *ExtType = VectorType::get(LargestOpTy, VectorTy); // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of // different sizes. We take the largest type as the ext to reduce, and add // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). InstructionCost ExtCost0 = TTI.getCastInstrCost( Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), TTI::CastContextHint::None, CostKind, Op0); InstructionCost ExtCost1 = TTI.getCastInstrCost( Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), TTI::CastContextHint::None, CostKind, Op1); InstructionCost MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); InstructionCost RedCost = TTI.getMulAccReductionCost( IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); InstructionCost ExtraExtCost = 0; if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? 
Op0 : Op1; ExtraExtCost = TTI.getCastInstrCost( ExtraExtOp->getOpcode(), ExtType, VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), TTI::CastContextHint::None, CostKind, ExtraExtOp); } if (RedCost.isValid() && (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) return I == RetI ? RedCost : 0; } else if (!match(I, m_ZExtOrSExt(m_Value()))) { // Matched reduce.add(mul()) InstructionCost MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); InstructionCost RedCost = TTI.getMulAccReductionCost( true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); if (RedCost.isValid() && RedCost < MulCost + BaseCost) return I == RetI ? RedCost : 0; } } return I == RetI ? std::optional(BaseCost) : std::nullopt; } InstructionCost LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, ElementCount VF) { // Calculate scalar cost only. Vectorization cost should be ready at this // moment. if (VF.isScalar()) { Type *ValTy = getLoadStoreType(I); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, TTI::TCK_RecipThroughput, OpInfo, I); } return getWideningCost(I, VF); } LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF) { // If we know that this instruction will remain uniform, check the cost of // the scalar version. if (isUniformAfterVectorization(I, VF)) VF = ElementCount::getFixed(1); if (VF.isVector() && isProfitableToScalarize(I, VF)) return VectorizationCostTy(InstsToScalarize[VF][I], false); // Forced scalars do not have any scalarization overhead. auto ForcedScalar = ForcedScalars.find(VF); if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { auto InstSet = ForcedScalar->second; if (InstSet.count(I)) return VectorizationCostTy( (getInstructionCost(I, ElementCount::getFixed(1)).first * VF.getKnownMinValue()), false); } Type *VectorTy; InstructionCost C = getInstructionCost(I, VF, VectorTy); bool TypeNotScalarized = false; if (VF.isVector() && VectorTy->isVectorTy()) { if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { if (VF.isScalable()) // is assumed to be profitable over iN because // scalable registers are a distinct register class from scalar ones. // If we ever find a target which wants to lower scalable vectors // back to scalars, we'll need to update this code to explicitly // ask TTI about the register class uses for each part. TypeNotScalarized = NumParts <= VF.getKnownMinValue(); else TypeNotScalarized = NumParts < VF.getKnownMinValue(); } else C = InstructionCost::getInvalid(); } return VectorizationCostTy(C, TypeNotScalarized); } InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { // There is no mechanism yet to create a scalable scalarization loop, // so this is currently Invalid. if (VF.isScalable()) return InstructionCost::getInvalid(); if (VF.isScalar()) return 0; InstructionCost Cost = 0; Type *RetTy = ToVectorTy(I->getType(), VF); if (!RetTy->isVoidTy() && (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead( cast(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), /*Insert*/ true, /*Extract*/ false, CostKind); // Some targets keep addresses scalar. 
if (isa(I) && !TTI.prefersVectorizedAddressing()) return Cost; // Some targets support efficient element stores. if (isa(I) && TTI.supportsEfficientVectorElementLoadStore()) return Cost; // Collect operands to consider. CallInst *CI = dyn_cast(I); Instruction::op_range Ops = CI ? CI->args() : I->operands(); // Skip operands that do not require extraction/scalarization and do not incur // any overhead. SmallVector Tys; for (auto *V : filterExtractingOperands(Ops, VF)) Tys.push_back(MaybeVectorizeType(V->getType(), VF)); return Cost + TTI.getOperandsScalarizationOverhead( filterExtractingOperands(Ops, VF), Tys, CostKind); } void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { if (VF.isScalar()) return; NumPredStores = 0; for (BasicBlock *BB : TheLoop->blocks()) { // For each instruction in the old loop. for (Instruction &I : *BB) { Value *Ptr = getLoadStorePointerOperand(&I); if (!Ptr) continue; // TODO: We should generate better code and update the cost model for // predicated uniform stores. Today they are treated as any other // predicated store (see added test cases in // invariant-store-vectorization.ll). if (isa(&I) && isScalarWithPredication(&I, VF)) NumPredStores++; if (Legal->isUniformMemOp(I, VF)) { auto isLegalToScalarize = [&]() { if (!VF.isScalable()) // Scalarization of fixed length vectors "just works". return true; // We have dedicated lowering for unpredicated uniform loads and // stores. Note that even with tail folding we know that at least // one lane is active (i.e. generalized predication is not possible // here), and the logic below depends on this fact. if (!foldTailByMasking()) return true; // For scalable vectors, a uniform memop load is always // uniform-by-parts and we know how to scalarize that. if (isa(I)) return true; // A uniform store isn't neccessarily uniform-by-part // and we can't assume scalarization. auto &SI = cast(I); return TheLoop->isLoopInvariant(SI.getValueOperand()); }; const InstructionCost GatherScatterCost = isLegalGatherOrScatter(&I, VF) ? getGatherScatterCost(&I, VF) : InstructionCost::getInvalid(); // Load: Scalar load + broadcast // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract // FIXME: This cost is a significant under-estimate for tail folded // memory ops. const InstructionCost ScalarizationCost = isLegalToScalarize() ? getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid(); // Choose better solution for the current VF, Note that Invalid // costs compare as maximumal large. If both are invalid, we get // scalable invalid which signals a failure and a vectorization abort. if (GatherScatterCost < ScalarizationCost) setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost); else setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost); continue; } // We assume that widening is the best solution when possible. if (memoryInstructionCanBeWidened(&I, VF)) { InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); int ConsecutiveStride = Legal->isConsecutivePtr( getLoadStoreType(&I), getLoadStorePointerOperand(&I)); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Expected consecutive stride."); InstWidening Decision = ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; setWideningDecision(&I, VF, Decision, Cost); continue; } // Choose between Interleaving, Gather/Scatter or Scalarization. 
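      // The comparison below prefers interleaving on a tie with gather/scatter
      // but requires it to strictly beat scalarization; an invalid cost
      // compares as effectively infinite, so unsupported strategies drop out
      // naturally.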
InstructionCost InterleaveCost = InstructionCost::getInvalid(); unsigned NumAccesses = 1; if (isAccessInterleaved(&I)) { auto Group = getInterleavedAccessGroup(&I); assert(Group && "Fail to get an interleaved access group."); // Make one decision for the whole group. if (getWideningDecision(&I, VF) != CM_Unknown) continue; NumAccesses = Group->getNumMembers(); if (interleavedAccessCanBeWidened(&I, VF)) InterleaveCost = getInterleaveGroupCost(&I, VF); } InstructionCost GatherScatterCost = isLegalGatherOrScatter(&I, VF) ? getGatherScatterCost(&I, VF) * NumAccesses : InstructionCost::getInvalid(); InstructionCost ScalarizationCost = getMemInstScalarizationCost(&I, VF) * NumAccesses; // Choose better solution for the current VF, // write down this decision and use it during vectorization. InstructionCost Cost; InstWidening Decision; if (InterleaveCost <= GatherScatterCost && InterleaveCost < ScalarizationCost) { Decision = CM_Interleave; Cost = InterleaveCost; } else if (GatherScatterCost < ScalarizationCost) { Decision = CM_GatherScatter; Cost = GatherScatterCost; } else { Decision = CM_Scalarize; Cost = ScalarizationCost; } // If the instructions belongs to an interleave group, the whole group // receives the same decision. The whole group receives the cost, but // the cost will actually be assigned to one instruction. if (auto Group = getInterleavedAccessGroup(&I)) setWideningDecision(Group, VF, Decision, Cost); else setWideningDecision(&I, VF, Decision, Cost); } } // Make sure that any load of address and any other address computation // remains scalar unless there is gather/scatter support. This avoids // inevitable extracts into address registers, and also has the benefit of // activating LSR more, since that pass can't optimize vectorized // addresses. if (TTI.prefersVectorizedAddressing()) return; // Start with all scalar pointer uses. SmallPtrSet AddrDefs; for (BasicBlock *BB : TheLoop->blocks()) for (Instruction &I : *BB) { Instruction *PtrDef = dyn_cast_or_null(getLoadStorePointerOperand(&I)); if (PtrDef && TheLoop->contains(PtrDef) && getWideningDecision(&I, VF) != CM_GatherScatter) AddrDefs.insert(PtrDef); } // Add all instructions used to generate the addresses. SmallVector Worklist; append_range(Worklist, AddrDefs); while (!Worklist.empty()) { Instruction *I = Worklist.pop_back_val(); for (auto &Op : I->operands()) if (auto *InstOp = dyn_cast(Op)) if ((InstOp->getParent() == I->getParent()) && !isa(InstOp) && AddrDefs.insert(InstOp).second) Worklist.push_back(InstOp); } for (auto *I : AddrDefs) { if (isa(I)) { // Setting the desired widening decision should ideally be handled in // by cost functions, but since this involves the task of finding out // if the loaded register is involved in an address computation, it is // instead changed here when we know this is the case. InstWidening Decision = getWideningDecision(I, VF); if (Decision == CM_Widen || Decision == CM_Widen_Reverse) // Scalarize a widened load of address. setWideningDecision( I, VF, CM_Scalarize, (VF.getKnownMinValue() * getMemoryInstructionCost(I, ElementCount::getFixed(1)))); else if (auto Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) setWideningDecision( Member, VF, CM_Scalarize, (VF.getKnownMinValue() * getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); } } } else // Make sure I gets scalarized and a cost estimate without // scalarization overhead. 
ForcedScalars[VF].insert(I); } } InstructionCost LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy) { Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); auto SE = PSE.getSE(); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; auto hasSingleCopyAfterVectorization = [this](Instruction *I, ElementCount VF) -> bool { if (VF.isScalar()) return true; auto Scalarized = InstsToScalarize.find(VF); assert(Scalarized != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); return !Scalarized->second.count(I) && llvm::all_of(I->users(), [&](User *U) { auto *UI = cast(U); return !Scalarized->second.count(UI); }); }; (void) hasSingleCopyAfterVectorization; if (isScalarAfterVectorization(I, VF)) { // With the exception of GEPs and PHIs, after scalarization there should // only be one copy of the instruction generated in the loop. This is // because the VF is either 1, or any instructions that need scalarizing // have already been dealt with by the the time we get here. As a result, // it means we don't have to multiply the instruction cost by VF. assert(I->getOpcode() == Instruction::GetElementPtr || I->getOpcode() == Instruction::PHI || (I->getOpcode() == Instruction::BitCast && I->getType()->isPointerTy()) || hasSingleCopyAfterVectorization(I, VF)); VectorTy = RetTy; } else VectorTy = ToVectorTy(RetTy, VF); // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { case Instruction::GetElementPtr: // We mark this instruction as zero-cost because the cost of GEPs in // vectorized code depends on whether the corresponding memory instruction // is scalarized or not. Therefore, we handle GEPs with the memory // instruction cost. return 0; case Instruction::Br: { // In cases of scalarized and predicated instructions, there will be VF // predicated blocks in the vectorized loop. Each branch around these // blocks requires also an extract of its vector compare i1 element. bool ScalarPredicatedBB = false; BranchInst *BI = cast(I); if (VF.isVector() && BI->isConditional() && (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1)))) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { // Not possible to scalarize scalable vector with predicated instructions. if (VF.isScalable()) return InstructionCost::getInvalid(); // Return cost for branches around scalarized and predicated blocks. auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); return ( TTI.getScalarizationOverhead( Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, /*Extract*/ true, CostKind) + (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) // The back-edge branch will remain, as will all scalar branches. return TTI.getCFInstrCost(Instruction::Br, CostKind); else // This branch will be eliminated by if-conversion. return 0; // Note: We currently assume zero cost for an unconditional branch inside // a predicated block since it will become a fall-through, although we // may decide in the future to call TTI for all branches. } case Instruction::PHI: { auto *Phi = cast(I); // First-order recurrences are replaced by vector shuffles inside the loop. 
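    // For example, at VF=4 the splice mask built below is <3, 4, 5, 6>: the
    // last lane of the vector from the previous iteration followed by the
    // first three lanes of the current one.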
if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { SmallVector Mask(VF.getKnownMinValue()); std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, cast(VectorTy), Mask, CostKind, VF.getKnownMinValue() - 1); } // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi // node, where N is the number of incoming values. if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) return (Phi->getNumIncomingValues() - 1) * TTI.getCmpSelInstrCost( Instruction::Select, ToVectorTy(Phi->getType(), VF), ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), CmpInst::BAD_ICMP_PREDICATE, CostKind); return TTI.getCFInstrCost(Instruction::PHI, CostKind); } case Instruction::UDiv: case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: if (VF.isVector() && isPredicatedInst(I)) { const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? ScalarCost : SafeDivisorCost; } // We've proven all lanes safe to speculate, fall through. [[fallthrough]]; case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: case Instruction::FDiv: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: case Instruction::And: case Instruction::Or: case Instruction::Xor: { // If we're speculating on the stride being 1, the multiplication may // fold away. We can generalize this for all operations using the notion // of neutral elements. (TODO) if (I->getOpcode() == Instruction::Mul && (PSE.getSCEV(I->getOperand(0))->isOne() || PSE.getSCEV(I->getOperand(1))->isOne())) return 0; // Detect reduction patterns if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) return *RedCost; // Certain instructions can be cheaper to vectorize if they have a constant // second vector operand. One example of this are shifts on x86. Value *Op2 = I->getOperand(1); auto Op2Info = TTI.getOperandInfo(Op2); if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isInvariant(Op2)) Op2Info.Kind = TargetTransformInfo::OK_UniformValue; SmallVector Operands(I->operand_values()); return TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, Op2Info, Operands, I); } case Instruction::FNeg: { return TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, I->getOperand(0), I); } case Instruction::Select: { SelectInst *SI = cast(I); const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); const Value *Op0, *Op1; using namespace llvm::PatternMatch; if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { // select x, y, false --> x & y // select x, true, y --> x | y const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0); const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1); assert(Op0->getType()->getScalarSizeInBits() == 1 && Op1->getType()->getScalarSizeInBits() == 1); SmallVector Operands{Op0, Op1}; return TTI.getArithmeticInstrCost( match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); } Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) CondTy = VectorType::get(CondTy, VF); CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; if (auto *Cmp = dyn_cast(SI->getCondition())) Pred = Cmp->getPredicate(); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, CostKind, I); } case Instruction::ICmp: case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); Instruction *Op0AsInstruction = dyn_cast(I->getOperand(0)); if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, cast(I)->getPredicate(), CostKind, I); } case Instruction::Store: case Instruction::Load: { ElementCount Width = VF; if (Width.isVector()) { InstWidening Decision = getWideningDecision(I, Width); assert(Decision != CM_Unknown && "CM decision should be taken at this point"); if (getWideningCost(I, VF) == InstructionCost::getInvalid()) return InstructionCost::getInvalid(); if (Decision == CM_Scalarize) Width = ElementCount::getFixed(1); } VectorTy = ToVectorTy(getLoadStoreType(I), Width); return getMemoryInstructionCost(I, VF); } case Instruction::BitCast: if (I->getType()->isPointerTy()) return 0; [[fallthrough]]; case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::FPExt: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::SIToFP: case Instruction::UIToFP: case Instruction::Trunc: case Instruction::FPTrunc: { // Computes the CastContextHint from a Load/Store instruction. auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { assert((isa(I) || isa(I)) && "Expected a load or a store!"); if (VF.isScalar() || !TheLoop->contains(I)) return TTI::CastContextHint::Normal; switch (getWideningDecision(I, VF)) { case LoopVectorizationCostModel::CM_GatherScatter: return TTI::CastContextHint::GatherScatter; case LoopVectorizationCostModel::CM_Interleave: return TTI::CastContextHint::Interleave; case LoopVectorizationCostModel::CM_Scalarize: case LoopVectorizationCostModel::CM_Widen: return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked : TTI::CastContextHint::Normal; case LoopVectorizationCostModel::CM_Widen_Reverse: return TTI::CastContextHint::Reversed; case LoopVectorizationCostModel::CM_Unknown: llvm_unreachable("Instr did not go through cost modelling?"); } llvm_unreachable("Unhandled case!"); }; unsigned Opcode = I->getOpcode(); TTI::CastContextHint CCH = TTI::CastContextHint::None; // For Trunc, the context is the only user, which must be a StoreInst. if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { if (I->hasOneUse()) if (StoreInst *Store = dyn_cast(*I->user_begin())) CCH = ComputeCCH(Store); } // For Z/Sext, the context is the operand, which must be a LoadInst. else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || Opcode == Instruction::FPExt) { if (LoadInst *Load = dyn_cast(I->getOperand(0))) CCH = ComputeCCH(Load); } // We optimize the truncation of induction variables having constant // integer steps. The cost of these truncations is the same as the scalar // operation. 
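    // (For instance, "trunc i64 %iv to i32" where %iv is an induction with a
    // constant integer step can simply become a narrower induction, so only
    // the scalar trunc is charged below.)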
if (isOptimizableIVTruncate(I, VF)) { auto *Trunc = cast(I); return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), Trunc->getSrcTy(), CCH, CostKind, Trunc); } // Detect reduction patterns if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) return *RedCost; Type *SrcScalarTy = I->getOperand(0)->getType(); Type *SrcVecTy = VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; if (canTruncateToMinimalBitwidth(I, VF)) { // This cast is going to be shrunk. This may remove the cast or it might // turn it into slightly different cast. For example, if MinBW == 16, // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". // // Calculate the modified src and dest types. Type *MinVecTy = VectorTy; if (Opcode == Instruction::Trunc) { SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { // Leave SrcVecTy unchanged - we only shrink the destination element // type. VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); } } return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } case Instruction::Call: { if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) return *RedCost; Function *Variant; CallInst *CI = cast(I); InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant); if (getVectorIntrinsicIDForCall(CI, TLI)) { InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); return std::min(CallCost, IntrinsicCost); } return CallCost; } case Instruction::ExtractValue: return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); case Instruction::Alloca: // We cannot easily widen alloca to a scalable alloca, as // the result would need to be a vector of pointers. if (VF.isScalable()) return InstructionCost::getInvalid(); [[fallthrough]]; default: // This opcode is unknown. Assume that it is the same as 'mul'. return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); } // end of switch. } void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); // Find all stores to invariant variables. Since they are going to sink // outside the loop we do not need calculate cost for them. for (BasicBlock *BB : TheLoop->blocks()) for (Instruction &I : *BB) { StoreInst *SI; if ((SI = dyn_cast(&I)) && Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) ValuesToIgnore.insert(&I); } // Ignore type-promoting instructions we identified during reduction // detection. for (const auto &Reduction : Legal->getReductionVars()) { const RecurrenceDescriptor &RedDes = Reduction.second; const SmallPtrSetImpl &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } // Ignore type-casting instructions we identified during induction // detection. for (const auto &Induction : Legal->getInductionVars()) { const InductionDescriptor &IndDes = Induction.second; const SmallVectorImpl &Casts = IndDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } } void LoopVectorizationCostModel::collectInLoopReductions() { for (const auto &Reduction : Legal->getReductionVars()) { PHINode *Phi = Reduction.first; const RecurrenceDescriptor &RdxDesc = Reduction.second; // We don't collect reductions that are type promoted (yet). 
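    // (That is, reductions whose recurrence type differs from the phi type;
    // presumably supporting them in-loop would need extra extends/truncs
    // around the reduction, which is not implemented yet.)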
if (RdxDesc.getRecurrenceType() != Phi->getType()) continue; // If the target would prefer this reduction to happen "in-loop", then we // want to record it as such. unsigned Opcode = RdxDesc.getOpcode(); if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && !TTI.preferInLoopReduction(Opcode, Phi->getType(), TargetTransformInfo::ReductionFlags())) continue; // Check that we can correctly put the reductions into the loop, by // finding the chain of operations that leads from the phi to the loop // exit value. SmallVector ReductionOperations = RdxDesc.getReductionOpChain(Phi, TheLoop); bool InLoop = !ReductionOperations.empty(); if (InLoop) { InLoopReductionChains[Phi] = ReductionOperations; // Add the elements to InLoopReductionImmediateChains for cost modelling. Instruction *LastChain = Phi; for (auto *I : ReductionOperations) { InLoopReductionImmediateChains[I] = LastChain; LastChain = I; } } LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") << " reduction for phi: " << *Phi << "\n"); } } // TODO: we could return a pair of values that specify the max VF and // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment // doesn't have a cost model that can choose which plan to execute if // more than one is generated. static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, LoopVectorizationCostModel &CM) { unsigned WidestType; std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); return WidestVectorRegBits / WidestType; } VectorizationFactor LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { assert(!UserVF.isScalable() && "scalable vectors not yet supported"); ElementCount VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in // the vectorization pipeline. if (!OrigLoop->isInnermost()) { // If the user doesn't provide a vectorization factor, determine a // reasonable one. if (UserVF.isZero()) { VF = ElementCount::getFixed(determineVPlanVF( TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) .getFixedValue(), CM)); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); // Make sure we have a VF > 1 for stress testing. if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " << "overriding computed VF.\n"); VF = ElementCount::getFixed(4); } } assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); assert(isPowerOf2_32(VF.getKnownMinValue()) && "VF needs to be a power of two"); LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") << "VF " << VF << " to build VPlans.\n"); buildVPlans(VF, VF); // For VPlan build stress testing, we bail out after VPlan construction. if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; } LLVM_DEBUG( dbgs() << "LV: Not vectorizing. 
Inner loops aren't supported in the "
               "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

std::optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
    return std::nullopt;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
  if (!UserVF.isZero() && UserVFIsLegal) {
    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (CM.selectUserVectorizationFactor(UserVF)) {
      LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
      CM.collectInLoopReductions();
      buildVPlansWithVPRecipes(UserVF, UserVF);
      if (!hasPlanWithVF(UserVF)) {
        LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
                          << ".\n");
        return std::nullopt;
      }

      LLVM_DEBUG(printPlans(dbgs()));
      return {{UserVF, 0, 0}};
    } else
      reportVectorizationInfo("UserVF ignored because of invalid costs.",
                              "InvalidCost", ORE, OrigLoop);
  }

  // Populate the set of Vectorization Factor Candidates.
  ElementCountSet VFCandidates;
  for (auto VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
    VFCandidates.insert(VF);
  for (auto VF = ElementCount::getScalable(1);
       ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.insert(VF);

  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF.isVector())
      CM.collectInstsToScalarize(VF);
  }

  CM.collectInLoopReductions();
  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
  if (!MaxFactors.hasVector())
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
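  // Note that VFCandidates always includes the scalar VF=1 candidate, so the
  // scalar baseline that the selection below compares against comes out of the
  // same cost modelling as the vector candidates.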
VectorizationFactor VF = selectVectorizationFactor(VFCandidates); assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); if (!hasPlanWithVF(VF.Width)) { LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width << ".\n"); return std::nullopt; } return VF; } VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { assert(count_if(VPlans, [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 1 && "Best VF has not a single VPlan."); for (const VPlanPtr &Plan : VPlans) { if (Plan->hasVF(VF)) return *Plan.get(); } llvm_unreachable("No plan found!"); } static void AddRuntimeUnrollDisableMetaData(Loop *L) { SmallVector MDs; // Reserve first location for self reference to the LoopID metadata node. MDs.push_back(nullptr); bool IsUnrollMetadata = false; MDNode *LoopID = L->getLoopID(); if (LoopID) { // First find existing loop unrolling disable metadata. for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { auto *MD = dyn_cast(LoopID->getOperand(i)); if (MD) { const auto *S = dyn_cast(MD->getOperand(0)); IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.disable"); } MDs.push_back(LoopID->getOperand(i)); } } if (!IsUnrollMetadata) { // Add runtime unroll disable metadata. LLVMContext &Context = L->getHeader()->getContext(); SmallVector DisableOperands; DisableOperands.push_back( MDString::get(Context, "llvm.loop.unroll.runtime.disable")); MDNode *DisableNode = MDNode::get(Context, DisableOperands); MDs.push_back(DisableNode); MDNode *NewLoopID = MDNode::get(Context, MDs); // Set operand 0 to refer to the loop id itself. NewLoopID->replaceOperandWith(0, NewLoopID); L->setLoopID(NewLoopID); } } SCEV2ValueTy LoopVectorizationPlanner::executePlan( ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, DenseMap *ExpandedSCEVs) { assert(BestVPlan.hasVF(BestVF) && "Trying to execute plan with unsupported VF"); assert(BestVPlan.hasUF(BestUF) && "Trying to execute plan with unsupported UF"); assert( (IsEpilogueVectorization || !ExpandedSCEVs) && "expanded SCEVs to reuse can only be used during epilogue vectorization"); LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF << '\n'); if (!IsEpilogueVectorization) VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); // Perform the actual loop transformation. VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; // 0. Generate SCEV-dependent code into the preheader, including TripCount, // before making any changes to the CFG. if (!BestVPlan.getPreheader()->empty()) { State.CFG.PrevBB = OrigLoop->getLoopPreheader(); State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); BestVPlan.getPreheader()->execute(&State); } if (!ILV.getTripCount()) ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0})); else assert(IsEpilogueVectorization && "should only re-use the existing trip " "count during epilogue vectorization"); // 1. Set up the skeleton for vectorization, including vector pre-header and // middle block. The vector loop is created during VPlan execution. Value *CanonicalIVStartValue; std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs); // Only use noalias metadata when using memory checks guaranteeing no overlap // across all iterations. 
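  // (Pointer-difference checks only rule out overlap within the vectorized
  // iteration window, not across the whole loop, so they are not enough to
  // justify noalias metadata; hence the getDiffChecks() test below.)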
const LoopAccessInfo *LAI = ILV.Legal->getLAI(); std::unique_ptr LVer = nullptr; if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && !LAI->getRuntimePointerChecking()->getDiffChecks()) { // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. // TODO: Find a better way to re-use LoopVersioning functionality to add // metadata. LVer = std::make_unique( *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, PSE.getSE()); State.LVer = &*LVer; State.LVer->prepareNoAliasMetadata(); } ILV.collectPoisonGeneratingRecipes(State); ILV.printDebugTracesAtStart(); //===------------------------------------------------===// // // Notice: any optimization or new instruction that go // into the code below should also be implemented in // the cost-model. // //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. BestVPlan.prepareToExecute( ILV.getTripCount(), ILV.getOrCreateVectorTripCount(nullptr), CanonicalIVStartValue, State, IsEpilogueVectorization); BestVPlan.execute(&State); // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). MDNode *OrigLoopID = OrigLoop->getLoopID(); std::optional VectorizedLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupVectorized}); VPBasicBlock *HeaderVPBB = BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); if (VectorizedLoopID) L->setLoopID(*VectorizedLoopID); else { // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). if (MDNode *LID = OrigLoop->getLoopID()) L->setLoopID(LID); LoopVectorizeHints Hints(L, true, *ORE); Hints.setAlreadyVectorized(); } TargetTransformInfo::UnrollingPreferences UP; TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) AddRuntimeUnrollDisableMetaData(L); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. ILV.fixVectorizedLoop(State, BestVPlan); ILV.printDebugTracesAtEnd(); return State.ExpandedSCEVs; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LoopVectorizationPlanner::printPlans(raw_ostream &O) { for (const auto &Plan : VPlans) if (PrintVPlansInDotFormat) Plan->printDOT(O); else Plan->print(O); } #endif //===--------------------------------------------------------------------===// // EpilogueVectorizerMainLoop //===--------------------------------------------------------------------===// /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. std::pair EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( const SCEV2ValueTy &ExpandedSCEVs) { createVectorLoopSkeleton(""); // Generate the code to check the minimum iteration count of the vector // epilogue (see below). EPI.EpilogueIterationCountCheck = emitIterationCountCheck(LoopScalarPreHeader, true); EPI.EpilogueIterationCountCheck->setName("iter.check"); // Generate the code to check any assumptions that we've made for SCEV // expressions. EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); // Generate the code that checks at runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. 
EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); // Generate the iteration count check for the main loop, *after* the check // for the epilogue loop, so that the path-length is shorter for the case // that goes directly through the vector epilogue. The longer-path length for // the main loop is compensated for, by the gain from vectorizing the larger // trip count. Note: the branch will get updated later on when we vectorize // the epilogue. EPI.MainLoopIterationCountCheck = emitIterationCountCheck(LoopScalarPreHeader, false); // Generate the induction variable. EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); // Skip induction resume value creation here because they will be created in // the second pass for the scalar loop. The induction resume values for the // inductions in the epilogue loop are created before executing the plan for // the epilogue loop. return {completeLoopSkeleton(), nullptr}; } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { LLVM_DEBUG({ dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" << "Main Loop VF:" << EPI.MainLoopVF << ", Main Loop UF:" << EPI.MainLoopUF << ", Epilogue Loop VF:" << EPI.EpilogueVF << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; }); } void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { DEBUG_WITH_TYPE(VerboseDebug, { dbgs() << "intermediate fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; }); } BasicBlock * EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue) { assert(Bypass && "Expected valid bypass basic block."); ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; Value *Count = getTripCount(); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; IRBuilder<> Builder(TCCheckBlock->getTerminator()); // Generate code to check if the loop's trip count is less than VF * UF of the // main vector loop. auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector() : VF.isVector()) ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; Value *CheckMinIters = Builder.CreateICmp( P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), "min.iters.check"); if (!ForEpilogue) TCCheckBlock->setName("vector.main.loop.iter.check"); // Create new preheader for vector loop. LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, "vector.ph"); if (ForEpilogue) { assert(DT->properlyDominates(DT->getNode(TCCheckBlock), DT->getNode(Bypass)->getIDom()) && "TC check is expected to dominate Bypass"); // Update dominator for Bypass & LoopExit. DT->changeImmediateDominator(Bypass, TCCheckBlock); if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) // For loops with multiple exits, there's no edge from the middle block // to exit blocks (as the epilogue must run) and thus no need to update // the immediate dominator of the exit blocks. DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); LoopBypassBlocks.push_back(TCCheckBlock); // Save the trip count so we don't have to regenerate it in the // vec.epilog.iter.check. This is safe to do because the trip count // generated here dominates the vector epilog iter check. 
EPI.TripCount = Count; } ReplaceInstWithInst( TCCheckBlock->getTerminator(), BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); return TCCheckBlock; } //===--------------------------------------------------------------------===// // EpilogueVectorizerEpilogueLoop //===--------------------------------------------------------------------===// /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. std::pair EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( const SCEV2ValueTy &ExpandedSCEVs) { createVectorLoopSkeleton("vec.epilog."); // Now, compare the remaining count and if there aren't enough iterations to // execute the vectorized epilogue skip to the scalar part. BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); LoopVectorPreHeader = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, LI, nullptr, "vec.epilog.ph"); emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, VecEpilogueIterationCountCheck); // Adjust the control flow taking the state info from the main loop // vectorization into account. assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && "expected this to be saved from the previous pass."); EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( VecEpilogueIterationCountCheck, LoopVectorPreHeader); DT->changeImmediateDominator(LoopVectorPreHeader, EPI.MainLoopIterationCountCheck); EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( VecEpilogueIterationCountCheck, LoopScalarPreHeader); if (EPI.SCEVSafetyCheck) EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( VecEpilogueIterationCountCheck, LoopScalarPreHeader); if (EPI.MemSafetyCheck) EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( VecEpilogueIterationCountCheck, LoopScalarPreHeader); DT->changeImmediateDominator( VecEpilogueIterationCountCheck, VecEpilogueIterationCountCheck->getSinglePredecessor()); DT->changeImmediateDominator(LoopScalarPreHeader, EPI.EpilogueIterationCountCheck); if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); // Keep track of bypass blocks, as they feed start values to the induction and // reduction phis in the scalar loop preheader. if (EPI.SCEVSafetyCheck) LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); if (EPI.MemSafetyCheck) LoopBypassBlocks.push_back(EPI.MemSafetyCheck); LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); // The vec.epilog.iter.check block may contain Phi nodes from inductions or // reductions which merge control-flow from the latch block and the middle // block. Update the incoming values here and move the Phi into the preheader. SmallVector PhisInBlock; for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) PhisInBlock.push_back(&Phi); for (PHINode *Phi : PhisInBlock) { Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); Phi->replaceIncomingBlockWith( VecEpilogueIterationCountCheck->getSinglePredecessor(), VecEpilogueIterationCountCheck); // If the phi doesn't have an incoming value from the // EpilogueIterationCountCheck, we are done. 
    // Otherwise remove the incoming value and also those from other check
    // blocks. This is needed for reduction phis only.
    if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
          return EPI.EpilogueIterationCountCheck == IncB;
        }))
      continue;
    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
    if (EPI.SCEVSafetyCheck)
      Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
    if (EPI.MemSafetyCheck)
      Phi->removeIncomingValue(EPI.MemSafetyCheck);
  }

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to iteration count
  // check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(ExpandedSCEVs,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  return {completeLoopSkeleton(), EPResumeVal};
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()) ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; Value *CheckMinIters = Builder.CreateICmp(P, Count, createStepForVF(Builder, Count->getType(), EPI.EpilogueVF, EPI.EpilogueUF), "min.epilog.iters.check"); ReplaceInstWithInst( Insert->getTerminator(), BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); LoopBypassBlocks.push_back(Insert); return Insert; } void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { LLVM_DEBUG({ dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" << "Epilogue Loop VF:" << EPI.EpilogueVF << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; }); } void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { DEBUG_WITH_TYPE(VerboseDebug, { dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; }); } bool LoopVectorizationPlanner::getDecisionAndClampRange( const std::function &Predicate, VFRange &Range) { assert(!Range.isEmpty() && "Trying to test an empty VF range."); bool PredicateAtRangeStart = Predicate(Range.Start); for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End)) if (Predicate(TmpVF) != PredicateAtRangeStart) { Range.End = TmpVF; break; } return PredicateAtRangeStart; } /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range /// of VF's starting at a given VF and extending it as much as possible. Each /// vectorization decision can potentially shorten this sub-range during /// buildVPlan(). void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, ElementCount MaxVF) { auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; VPlans.push_back(buildVPlan(SubRange)); VF = SubRange.End; } } VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlan &Plan) { assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); // Look for cached value. std::pair Edge(Src, Dst); EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); if (ECEntryIt != EdgeMaskCache.end()) return ECEntryIt->second; VPValue *SrcMask = createBlockInMask(Src, Plan); // The terminator has to be a branch inst! BranchInst *BI = dyn_cast(Src->getTerminator()); assert(BI && "Unexpected terminator found"); if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) return EdgeMaskCache[Edge] = SrcMask; // If source is an exiting block, we know the exit edge is dynamically dead // in the vector loop, and thus we don't need to restrict the mask. Avoid // adding uses of an otherwise potentially dead instruction. if (OrigLoop->isLoopExiting(Src)) return EdgeMaskCache[Edge] = SrcMask; VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition()); assert(EdgeMask && "No Edge Mask found for condition"); if (BI->getSuccessor(0) != Dst) EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. // The condition is 'SrcMask && EdgeMask', which is equivalent to // 'select i1 SrcMask, i1 EdgeMask, i1 false'. // The select version does not introduce new UB if SrcMask is false and // EdgeMask is poison. Using 'and' here introduces undefined behavior. 
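  // Concretely, with SrcMask = false and EdgeMask = poison:
  //   select i1 false, i1 poison, i1 false  -->  false
  //   and    i1 false, i1 poison            -->  poison
  // so only the select form keeps the resulting mask lane well-defined.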
VPValue *False = Plan.getVPValueOrAddLiveIn( ConstantInt::getFalse(BI->getCondition()->getType())); EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); } return EdgeMaskCache[Edge] = EdgeMask; } VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); // Look for cached value. BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); if (BCEntryIt != BlockMaskCache.end()) return BCEntryIt->second; // All-one mask is modelled as no-mask following the convention for masked // load/store/gather/scatter. Initialize BlockMask to no-mask. VPValue *BlockMask = nullptr; if (OrigLoop->getHeader() == BB) { if (!CM.blockNeedsPredicationForAnyReason(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. assert(CM.foldTailByMasking() && "must fold the tail"); // If we're using the active lane mask for control flow, then we get the // mask from the active lane mask PHI that is cached in the VPlan. TailFoldingStyle TFStyle = CM.getTailFoldingStyle(); if (useActiveLaneMaskForControlFlow(TFStyle)) return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi(); // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by // constructing the desired canonical IV in the header block as its first // non-phi instructions. VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); if (useActiveLaneMask(TFStyle)) { VPValue *TC = Plan.getTripCount(); BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, nullptr, "active.lane.mask"); } else { VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); } return BlockMaskCache[BB] = BlockMask; } // This is the block mask. We OR all incoming edges. for (auto *Predecessor : predecessors(BB)) { VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. return BlockMaskCache[BB] = EdgeMask; if (!BlockMask) { // BlockMask has its initialized nullptr value. 
BlockMask = EdgeMask; continue; } BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); } return BlockMaskCache[BB] = BlockMask; } VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, VFRange &Range, VPlanPtr &Plan) { assert((isa(I) || isa(I)) && "Must be called with either a load or store"); auto willWiden = [&](ElementCount VF) -> bool { LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); assert(Decision != LoopVectorizationCostModel::CM_Unknown && "CM decision should be taken at this point."); if (Decision == LoopVectorizationCostModel::CM_Interleave) return true; if (CM.isScalarAfterVectorization(I, VF) || CM.isProfitableToScalarize(I, VF)) return false; return Decision != LoopVectorizationCostModel::CM_Scalarize; }; if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) return nullptr; VPValue *Mask = nullptr; if (Legal->isMaskRequired(I)) Mask = createBlockInMask(I->getParent(), *Plan); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, Range.Start); bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; bool Consecutive = Reverse || Decision == LoopVectorizationCostModel::CM_Widen; if (LoadInst *Load = dyn_cast(I)) return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, Consecutive, Reverse); StoreInst *Store = cast(I); return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], Mask, Consecutive, Reverse); } /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also /// insert a recipe to expand the step for the induction recipe. static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { assert(IndDesc.getStartValue() == Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && "step must be loop invariant"); VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); if (auto *TruncI = dyn_cast(PhiOrTrunc)) { return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI); } assert(isa(PhiOrTrunc) && "must be a phi node here"); return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc); } VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( PHINode *Phi, ArrayRef Operands, VPlan &Plan, VFRange &Range) { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan, *PSE.getSE(), *OrigLoop, Range); // Check if this is pointer induction. If so, build the recipe for it. 
if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), *PSE.getSE()); return new VPWidenPointerInductionRecipe( Phi, Operands[0], Step, *II, LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return CM.isScalarAfterVectorization(Phi, VF); }, Range)); } return nullptr; } VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( TruncInst *I, ArrayRef Operands, VFRange &Range, VPlan &Plan) { // Optimize the special case where the source is a constant integer // induction variable. Notice that we can only optimize the 'trunc' case // because (a) FP conversions lose precision, (b) sext/zext may wrap, and // (c) other casts depend on pointer size. // Determine whether \p K is a truncation based on an induction variable that // can be optimized. auto isOptimizableIVTruncate = [&](Instruction *K) -> std::function { return [=](ElementCount VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; }; if (LoopVectorizationPlanner::getDecisionAndClampRange( isOptimizableIVTruncate(I), Range)) { auto *Phi = cast(I->getOperand(0)); const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue()); return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), *OrigLoop, Range); } return nullptr; } VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, ArrayRef Operands, VPlanPtr &Plan) { // If all incoming values are equal, the incoming VPValue can be used directly // instead of creating a new VPBlendRecipe. if (llvm::all_equal(Operands)) return Operands[0]; unsigned NumIncoming = Phi->getNumIncomingValues(); // For in-loop reductions, we do not need to create an additional select. VPValue *InLoopVal = nullptr; for (unsigned In = 0; In < NumIncoming; In++) { PHINode *PhiOp = dyn_cast_or_null(Operands[In]->getUnderlyingValue()); if (PhiOp && CM.isInLoopReduction(PhiOp)) { assert(!InLoopVal && "Found more than one in-loop reduction!"); InLoopVal = Operands[In]; } } assert((!InLoopVal || NumIncoming == 2) && "Found an in-loop reduction for PHI with unexpected number of " "incoming values"); if (InLoopVal) return Operands[Operands[0] == InLoopVal ? 1 : 0]; // We know that all PHIs in non-header blocks are converted into selects, so // we don't have to worry about the insertion order and we can just use the // builder. At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. 
SmallVector OperandsWithMask; for (unsigned In = 0; In < NumIncoming; In++) { VPValue *EdgeMask = createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan); assert((EdgeMask || NumIncoming == 1) && "Multiple predecessors with one having a full mask"); OperandsWithMask.push_back(Operands[In]); if (EdgeMask) OperandsWithMask.push_back(EdgeMask); } return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); } VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, ArrayRef Operands, VFRange &Range, VPlanPtr &Plan) { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI, VF); }, Range); if (IsPredicated) return nullptr; Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || ID == Intrinsic::pseudoprobe || ID == Intrinsic::experimental_noalias_scope_decl)) return nullptr; SmallVector Ops(Operands.take_front(CI->arg_size())); // Is it beneficial to perform intrinsic call compared to lib call? bool ShouldUseVectorIntrinsic = ID && LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) -> bool { Function *Variant; // Is it beneficial to perform intrinsic call compared to lib // call? InstructionCost CallCost = CM.getVectorCallCost(CI, VF, &Variant); InstructionCost IntrinsicCost = CM.getVectorIntrinsicCost(CI, VF); return IntrinsicCost <= CallCost; }, Range); if (ShouldUseVectorIntrinsic) return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID); Function *Variant = nullptr; ElementCount VariantVF; bool NeedsMask = false; // Is better to call a vectorized version of the function than to to scalarize // the call? auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) -> bool { // The following case may be scalarized depending on the VF. // The flag shows whether we can use a usual Call for vectorized // version of the instruction. // If we've found a variant at a previous VF, then stop looking. A // vectorized variant of a function expects input in a certain shape // -- basically the number of input registers, the number of lanes // per register, and whether there's a mask required. // We store a pointer to the variant in the VPWidenCallRecipe, so // once we have an appropriate variant it's only valid for that VF. // This will force a different vplan to be generated for each VF that // finds a valid variant. if (Variant) return false; CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask); // If we found a valid vector variant at this VF, then store the VF // in case we need to generate a mask. if (Variant) VariantVF = VF; return Variant != nullptr; }, Range); if (ShouldUseVectorCall) { if (NeedsMask) { // We have 2 cases that would require a mask: // 1) The block needs to be predicated, either due to a conditional // in the scalar loop or use of an active lane mask with // tail-folding, and we use the appropriate mask for the block. // 2) No mask is required for the block, but the only available // vector variant at this VF requires a mask, so we synthesize an // all-true mask. 
VPValue *Mask = nullptr; if (Legal->isMaskRequired(CI)) Mask = createBlockInMask(CI->getParent(), *Plan); else Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue( IntegerType::getInt1Ty(Variant->getFunctionType()->getContext()))); VFShape Shape = VFShape::get(*CI, VariantVF, /*HasGlobalPred=*/true); unsigned MaskPos = 0; for (const VFInfo &Info : VFDatabase::getMappings(*CI)) if (Info.Shape == Shape) { assert(Info.isMasked() && "Vector function info shape mismatch"); MaskPos = Info.getParamIndexForOptionalMask().value(); break; } Ops.insert(Ops.begin() + MaskPos, Mask); } return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), Intrinsic::not_intrinsic, Variant); } return nullptr; } bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { assert(!isa(I) && !isa(I) && !isa(I) && !isa(I) && "Instruction should have been handled earlier"); // Instruction should be widened, unless it is scalar after vectorization, // scalarization is profitable or it is predicated. auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I, VF); }; return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, Range); } VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, ArrayRef Operands, VPBasicBlock *VPBB, VPlanPtr &Plan) { switch (I->getOpcode()) { default: return nullptr; case Instruction::SDiv: case Instruction::UDiv: case Instruction::SRem: case Instruction::URem: { // If not provably safe, use a select to form a safe divisor before widening the // div/rem operation itself. Otherwise fall through to general handling below. if (CM.isPredicatedInst(I)) { SmallVector Ops(Operands.begin(), Operands.end()); VPValue *Mask = createBlockInMask(I->getParent(), *Plan); VPValue *One = Plan->getVPValueOrAddLiveIn( ConstantInt::get(I->getType(), 1u, false)); auto *SafeRHS = new VPInstruction(Instruction::Select, {Mask, Ops[1], One}, I->getDebugLoc()); VPBB->appendRecipe(SafeRHS); Ops[1] = SafeRHS; return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end())); } [[fallthrough]]; } case Instruction::Add: case Instruction::And: case Instruction::AShr: case Instruction::FAdd: case Instruction::FCmp: case Instruction::FDiv: case Instruction::FMul: case Instruction::FNeg: case Instruction::FRem: case Instruction::FSub: case Instruction::ICmp: case Instruction::LShr: case Instruction::Mul: case Instruction::Or: case Instruction::Select: case Instruction::Shl: case Instruction::Sub: case Instruction::Xor: case Instruction::Freeze: return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); }; } void VPRecipeBuilder::fixHeaderPhis() { BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); for (VPHeaderPHIRecipe *R : PhisToFix) { auto *PN = cast(R->getUnderlyingValue()); VPRecipeBase *IncR = getRecipe(cast(PN->getIncomingValueForBlock(OrigLatch))); R->addOperand(IncR->getVPSingleValue()); } } VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I, VFRange &Range, VPlan &Plan) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); bool IsPredicated = CM.isPredicatedInst(I); // Even if the instruction is not marked as uniform, there are certain // intrinsic calls that can be effectively treated as such, so we check for // them here. 
Conservatively, we only do this for scalable vectors, since // for fixed-width VFs we can always fall back on full scalarization. if (!IsUniform && Range.Start.isScalable() && isa(I)) { switch (cast(I)->getIntrinsicID()) { case Intrinsic::assume: case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: // For scalable vectors if one of the operands is variant then we still // want to mark as uniform, which will generate one instruction for just // the first lane of the vector. We can't scalarize the call in the same // way as for fixed-width vectors because we don't know how many lanes // there are. // // The reasons for doing it this way for scalable vectors are: // 1. For the assume intrinsic generating the instruction for the first // lane is still be better than not generating any at all. For // example, the input may be a splat across all lanes. // 2. For the lifetime start/end intrinsics the pointer operand only // does anything useful when the input comes from a stack object, // which suggests it should always be uniform. For non-stack objects // the effect is to poison the object, which still allows us to // remove the call. IsUniform = true; break; default: break; } } VPValue *BlockInMask = nullptr; if (!IsPredicated) { // Finalize the recipe for Instr, first if it is not predicated. LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); } else { LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); // Instructions marked for predication are replicated and a mask operand is // added initially. Masked replicate recipes will later be placed under an // if-then construct to prevent side-effects. Generate recipes to compute // the block mask for this region. BlockInMask = createBlockInMask(I->getParent(), Plan); } auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()), IsUniform, BlockInMask); return toVPRecipeResult(Recipe); } VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, ArrayRef Operands, VFRange &Range, VPBasicBlock *VPBB, VPlanPtr &Plan) { // First, check for specific widening recipes that deal with inductions, Phi // nodes, calls and memory operations. VPRecipeBase *Recipe; if (auto Phi = dyn_cast(Instr)) { if (Phi->getParent() != OrigLoop->getHeader()) return tryToBlend(Phi, Operands, Plan); // Always record recipes for header phis. Later first-order recurrence phis // can have earlier phis as incoming values. recordRecipeOf(Phi); if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) return toVPRecipeResult(Recipe); VPHeaderPHIRecipe *PhiRecipe = nullptr; assert((Legal->isReductionVariable(Phi) || Legal->isFixedOrderRecurrence(Phi)) && "can only widen reductions and fixed-order recurrences here"); VPValue *StartV = Operands[0]; if (Legal->isReductionVariable(Phi)) { const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars().find(Phi)->second; assert(RdxDesc.getRecurrenceStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi), CM.useOrderedReductions(RdxDesc)); } else { // TODO: Currently fixed-order recurrences are modeled as chains of // first-order recurrences. If there are no users of the intermediate // recurrences in the chain, the fixed order recurrence should be modeled // directly, enabling more efficient codegen. 
PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); } // Record the incoming value from the backedge, so we can add the incoming // value from the backedge after all recipes have been created. auto *Inc = cast( Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); auto RecipeIter = Ingredient2Recipe.find(Inc); if (RecipeIter == Ingredient2Recipe.end()) recordRecipeOf(Inc); PhisToFix.push_back(PhiRecipe); return toVPRecipeResult(PhiRecipe); } if (isa(Instr) && (Recipe = tryToOptimizeInductionTruncate(cast(Instr), Operands, Range, *Plan))) return toVPRecipeResult(Recipe); // All widen recipes below deal only with VF > 1. if (LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return VF.isScalar(); }, Range)) return nullptr; if (auto *CI = dyn_cast(Instr)) return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan)); if (isa(Instr) || isa(Instr)) return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); if (!shouldWiden(Instr, Range)) return nullptr; if (auto GEP = dyn_cast(Instr)) return toVPRecipeResult(new VPWidenGEPRecipe( GEP, make_range(Operands.begin(), Operands.end()))); if (auto *SI = dyn_cast(Instr)) { return toVPRecipeResult(new VPWidenSelectRecipe( *SI, make_range(Operands.begin(), Operands.end()))); } if (auto *CI = dyn_cast(Instr)) { return toVPRecipeResult( new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), CI)); } return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); } void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); // Add assume instructions we need to drop to DeadInstructions, to prevent // them from being added to the VPlan. // TODO: We only need to drop assumes in blocks that get flattend. If the // control flow is preserved, we should keep them. SmallPtrSet DeadInstructions; auto &ConditionalAssumes = Legal->getConditionalAssumes(); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions)) VPlans.push_back(std::move(*Plan)); VF = SubRange.End; } } // Add the necessary canonical IV and branch recipes required to control the // loop. static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, TailFoldingStyle Style) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); Header->insert(CanonicalIVPHI, Header->begin()); // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar // IV by VF * UF. bool HasNUW = Style == TailFoldingStyle::None; auto *CanonicalIVIncrement = new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW : VPInstruction::CanonicalIVIncrement, {CanonicalIVPHI}, DL, "index.next"); CanonicalIVPHI->addOperand(CanonicalIVIncrement); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); if (useActiveLaneMaskForControlFlow(Style)) { // Create the active lane mask instruction in the vplan preheader. 
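    // Roughly, the entry mask built below corresponds to
    //   llvm.get.active.lane.mask(StartV + Part * VF, TripCount)
    // i.e. lane i of part Part is active iff StartV + Part * VF + i is below
    // the trip count (illustrative; see the ActiveLaneMask lowering for the
    // exact semantics).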
VPBasicBlock *VecPreheader = cast(Plan.getVectorLoopRegion()->getSinglePredecessor()); // We can't use StartV directly in the ActiveLaneMask VPInstruction, since // we have to take unrolling into account. Each part needs to start at // Part * VF auto *CanonicalIVIncrementParts = new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW : VPInstruction::CanonicalIVIncrementForPart, {StartV}, DL, "index.part.next"); VecPreheader->appendRecipe(CanonicalIVIncrementParts); // Create the ActiveLaneMask instruction using the correct start values. VPValue *TC = Plan.getTripCount(); VPValue *TripCount, *IncrementValue; if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { // When avoiding a runtime check, the active.lane.mask inside the loop // uses a modified trip count and the induction variable increment is // done after the active.lane.mask intrinsic is called. auto *TCMinusVF = new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL); VecPreheader->appendRecipe(TCMinusVF); IncrementValue = CanonicalIVPHI; TripCount = TCMinusVF; } else { // When the loop is guarded by a runtime overflow check for the loop // induction variable increment by VF, we can increment the value before // the get.active.lane mask and use the unmodified tripcount. EB->appendRecipe(CanonicalIVIncrement); IncrementValue = CanonicalIVIncrement; TripCount = TC; } auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, {CanonicalIVIncrementParts, TC}, DL, "active.lane.mask.entry"); VecPreheader->appendRecipe(EntryALM); // Now create the ActiveLaneMaskPhi recipe in the main loop using the // preheader ActiveLaneMask instruction. auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); // Create the active lane mask for the next iteration of the loop. CanonicalIVIncrementParts = new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW : VPInstruction::CanonicalIVIncrementForPart, {IncrementValue}, DL); EB->appendRecipe(CanonicalIVIncrementParts); auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, {CanonicalIVIncrementParts, TripCount}, DL, "active.lane.mask.next"); EB->appendRecipe(ALM); LaneMaskPhi->addOperand(ALM); if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { // Do the increment of the canonical IV after the active.lane.mask, because // that value is still based off %CanonicalIVPHI EB->appendRecipe(CanonicalIVIncrement); } // We have to invert the mask here because a true condition means jumping // to the exit block. auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); EB->appendRecipe(NotMask); VPInstruction *BranchBack = new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); EB->appendRecipe(BranchBack); } else { EB->appendRecipe(CanonicalIVIncrement); // Add the BranchOnCount VPInstruction to the latch. VPInstruction *BranchBack = new VPInstruction( VPInstruction::BranchOnCount, {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); EB->appendRecipe(BranchBack); } } // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the // original exit block. static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, VPBasicBlock *MiddleVPBB, Loop *OrigLoop, VPlan &Plan) { BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); // Only handle single-exit loops with unique exit blocks for now. 
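// Illustrative example: if a value computed in the loop is read after it, the
// unique exit block holds an LCSSA phi for that value; addLiveOut below
// records the corresponding VPValue so VPlan execution can wire it into that
// phi.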
if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) return; // Introduce VPUsers modeling the exit values. for (PHINode &ExitPhi : ExitBB->phis()) { Value *IncomingValue = ExitPhi.getIncomingValueForBlock(ExitingBB); VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue); Plan.addLiveOut(&ExitPhi, V); } } std::optional LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl &DeadInstructions) { SmallPtrSet *, 1> InterleaveGroups; VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); // --------------------------------------------------------------------------- // Pre-construction: record ingredients whose recipes we'll need to further // process after constructing the initial VPlan. // --------------------------------------------------------------------------- for (const auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; RecurKind Kind = Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); const SmallVector &ReductionOperations = Reduction.second; RecipeBuilder.recordRecipeOf(Phi); for (const auto &R : ReductionOperations) { RecipeBuilder.recordRecipeOf(R); // For min/max reductions, where we have a pair of icmp/select, we also // need to record the ICmp recipe, so it can be removed later. assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && "Only min/max recurrences allowed for inloop reductions"); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) RecipeBuilder.recordRecipeOf(cast(R->getOperand(0))); } } // For each interleave group which is relevant for this (possibly trimmed) // Range, add it to the set of groups to be later applied to the VPlan and add // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. for (InterleaveGroup *IG : IAI.getInterleaveGroups()) { auto applyIG = [IG, this](ElementCount VF) -> bool { bool Result = (VF.isVector() && // Query is illegal for VF == 1 CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); // For scalable vectors, the only interleave factor currently supported // is 2 since we require the (de)interleave2 intrinsics instead of // shufflevectors. assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && "Unsupported interleave factor for scalable vectors"); return Result; }; if (!getDecisionAndClampRange(applyIG, Range)) continue; InterleaveGroups.insert(IG); for (unsigned i = 0; i < IG->getFactor(); i++) if (Instruction *Member = IG->getMember(i)) RecipeBuilder.recordRecipeOf(Member); }; // --------------------------------------------------------------------------- // Build initial VPlan: Scan the body of the loop in a topological order to // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- // Create initial VPlan skeleton, having a basic block for the pre-header // which contains SCEV expansions that need to happen before the CFG is // modified; a basic block for the vector pre-header, followed by a region for // the vector loop, followed by the middle basic block. The skeleton vector // loop region contains a header and latch basic blocks. 
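  // Rough sketch of that skeleton (illustrative only):
  //
  //   [preheader: SCEV expansions] -> [vector.ph]
  //       -> (region "vector loop": vector.body -> ... -> vector.latch)
  //       -> [middle.block]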
VPlanPtr Plan = VPlan::createInitialVPlan( createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), *PSE.getSE()); VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); VPBlockUtils::insertBlockAfter(TopRegion, Plan->getEntry()); VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); // Don't use getDecisionAndClampRange here, because we don't know the UF // so this function is better to be conservative, rather than to split // it up into different VPlans. bool IVUpdateMayOverflow = false; for (ElementCount VF : Range) IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); Instruction *DLInst = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), CM.getTailFoldingStyle(IVUpdateMayOverflow)); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); DFS.perform(LI); VPBasicBlock *VPBB = HeaderVPBB; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. if (VPBB != HeaderVPBB) VPBB->setName(BB->getName()); Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. // TODO: Model and preserve debug intrinsics in VPlan. for (Instruction &I : BB->instructionsWithoutDebug(false)) { Instruction *Instr = &I; // First filter out irrelevant instructions, to ensure no recipes are // built for them. if (isa(Instr) || DeadInstructions.count(Instr)) continue; SmallVector Operands; auto *Phi = dyn_cast(Instr); if (Phi && Phi->getParent() == OrigLoop->getHeader()) { Operands.push_back(Plan->getVPValueOrAddLiveIn( Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); } else { auto OpRange = Plan->mapToVPValues(Instr->operands()); Operands = {OpRange.begin(), OpRange.end()}; } // Invariant stores inside loop will be deleted and a single store // with the final reduction value will be added to the exit block StoreInst *SI; if ((SI = dyn_cast(&I)) && Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) continue; auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( Instr, Operands, Range, VPBB, Plan); if (!RecipeOrValue) RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan); // If Instr can be simplified to an existing VPValue, use it. if (isa(RecipeOrValue)) { auto *VPV = cast(RecipeOrValue); Plan->addVPValue(Instr, VPV); // If the re-used value is a recipe, register the recipe for the // instruction, in case the recipe for Instr needs to be recorded. if (VPRecipeBase *R = VPV->getDefiningRecipe()) RecipeBuilder.setRecipe(Instr, R); continue; } // Otherwise, add the new recipe. VPRecipeBase *Recipe = cast(RecipeOrValue); for (auto *Def : Recipe->definedValues()) { auto *UV = Def->getUnderlyingValue(); Plan->addVPValue(UV, Def); } RecipeBuilder.setRecipe(Instr, Recipe); if (isa(Recipe) && HeaderVPBB->getFirstNonPhi() != VPBB->end()) { // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the // phi section of HeaderVPBB. 
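          // The recipe created for such a truncate models a widened header
          // induction phi, so it belongs in the phi section of HeaderVPBB even
          // though the truncate itself appears later in the loop body.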
assert(isa(Instr)); Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); } else VPBB->appendRecipe(Recipe); } VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); VPBB = cast(VPBB->getSingleSuccessor()); } // After here, VPBB should not be used. VPBB = nullptr; if (CM.requiresScalarEpilogue(Range)) { // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. } else addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); assert(isa(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to // bring the VPlan to its final state. // --------------------------------------------------------------------------- // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(cast(TopRegion->getExiting()), Plan, RecipeBuilder, Range.Start); // Interleave memory: for each Interleave Group we marked earlier as relevant // for this VPlan, replace the Recipes widening its memory instructions with a // single VPInterleaveRecipe at its insertion point. for (const auto *IG : InterleaveGroups) { auto *Recipe = cast( RecipeBuilder.getRecipe(IG->getInsertPos())); SmallVector StoredValues; for (unsigned i = 0; i < IG->getFactor(); ++i) if (auto *SI = dyn_cast_or_null(IG->getMember(i))) { auto *StoreR = cast(RecipeBuilder.getRecipe(SI)); StoredValues.push_back(StoreR->getStoredValue()); } bool NeedsMaskForGaps = IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed(); auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, Recipe->getMask(), NeedsMaskForGaps); VPIG->insertBefore(Recipe); unsigned J = 0; for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *Member = IG->getMember(i)) { VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member); if (!Member->getType()->isVoidTy()) { VPValue *OriginalV = MemberR->getVPSingleValue(); OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); J++; } MemberR->eraseFromParent(); } } for (ElementCount VF : Range) Plan->addVF(VF); Plan->setName("Initial VPlan"); // Replace VPValues for known constant strides guaranteed by predicate scalar // evolution. for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { auto *StrideV = cast(Stride)->getValue(); auto *ScevStride = dyn_cast(PSE.getSCEV(StrideV)); // Only handle constant strides for now. if (!ScevStride) continue; Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt()); auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI); // The versioned value may not be used in the loop directly, so just add a // new live-in in those cases. Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV); } // From this point onwards, VPlan-to-VPlan transformations may change the plan // in ways that accessing values using original IR values is incorrect. Plan->disableValue2VPValue(); // Sink users of fixed-order recurrence past the recipe defining the previous // value and introduce FirstOrderRecurrenceSplice VPInstructions. 
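  // Illustrative example: for
  //   for (int i = 0; i < n; ++i) { b[i] = a[i] + prev; prev = a[i]; }
  // users of 'prev' are sunk past the recipe producing a[i] and read a
  // first-order-recurrence splice of the previous iteration's vector and the
  // current one (conceptually: the last lane of the previous vector followed
  // by the first VF-1 lanes of the current vector).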
  if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
    return std::nullopt;

  VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
  VPlanTransforms::removeRedundantInductionCasts(*Plan);

  VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
  VPlanTransforms::removeDeadRecipes(*Plan);

  VPlanTransforms::createAndOptimizeReplicateRegions(*Plan);

  VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
  VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);

  assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
  return std::make_optional(std::move(Plan));
}

VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan.
  auto Plan = VPlan::createInitialVPlan(
      createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
      *PSE.getSE());

  // Build hierarchical CFG.
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF : Range)
    Plan->addVF(VF);

  VPlanTransforms::VPInstructionsToVPRecipes(
      Plan,
      [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
      *PSE.getSE(), *TLI);

  // Remove the existing terminator of the exiting block of the top-most
  // region. A BranchOnCount will be added instead when adding the canonical
  // IV recipes.
  auto *Term =
      Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
  Term->eraseFromParent();

  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
                        CM.getTailFoldingStyle());
  return Plan;
}

// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  for (const auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc =
        Legal->getReductionVars().find(Phi)->second;
    const SmallVector<Instruction *, 4> &ReductionOperations =
        Reduction.second;

    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
      continue;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      // Recognize a call to the llvm.fmuladd intrinsic.
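      // Illustrative: an in-loop reduction of 'x = llvm.fmuladd(a, b, x)' is
      // modeled by emitting an explicit FMul recipe for a*b, which then feeds
      // the fadd reduction recipe created below.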
bool IsFMulAdd = (Kind == RecurKind::FMulAdd); assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && "Expected instruction to be a call to the llvm.fmuladd intrinsic"); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { assert(isa(WidenRecipe) && "Expected to replace a VPWidenSelectSC"); FirstOpId = 1; } else { assert((MinVF.isScalar() || isa(WidenRecipe) || (IsFMulAdd && isa(WidenRecipe))) && "Expected to replace a VPWidenSC"); FirstOpId = 0; } unsigned VecOpId = R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); VPValue *CondOp = nullptr; if (CM.blockNeedsPredicationForAnyReason(R->getParent())) { VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(WidenRecipe->getParent(), WidenRecipe->getIterator()); CondOp = RecipeBuilder.createBlockInMask(R->getParent(), *Plan); } if (IsFMulAdd) { // If the instruction is a call to the llvm.fmuladd intrinsic then we // need to create an fmul recipe to use as the vector operand for the // fadd reduction. VPInstruction *FMulRecipe = new VPInstruction( Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); FMulRecipe->setFastMathFlags(R->getFastMathFlags()); WidenRecipe->getParent()->insert(FMulRecipe, WidenRecipe->getIterator()); VecOp = FMulRecipe; } VPReductionRecipe *RedRecipe = new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, &TTI); WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); Plan->removeVPValueFor(R); Plan->addVPValue(R, RedRecipe); // Append the recipe to the end of the VPBasicBlock because we need to // ensure that it comes after all of it's inputs, including CondOp. WidenRecipe->getParent()->appendRecipe(RedRecipe); WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); WidenRecipe->eraseFromParent(); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { VPRecipeBase *CompareRecipe = RecipeBuilder.getRecipe(cast(R->getOperand(0))); assert(isa(CompareRecipe) && "Expected to replace a VPWidenSC"); assert(cast(CompareRecipe)->getNumUsers() == 0 && "Expected no remaining users"); CompareRecipe->eraseFromParent(); } Chain = R; } } // If tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the beginning of the // dedicated latch block. 
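// Illustrative shape: for a reduction phi %rdx updated to %rdx.next in the
// loop, tail folding emits 'select <header mask>, %rdx.next, %rdx' in the
// latch, so lanes beyond the trip count keep their previous value.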
if (CM.foldTailByMasking()) { Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { VPReductionPHIRecipe *PhiR = dyn_cast(&R); if (!PhiR || PhiR->isInLoop()) continue; VPValue *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan); VPValue *Red = PhiR->getBackedgeValue(); assert(Red->getDefiningRecipe()->getParent() != LatchVPBB && "reduction recipe must be defined before latch"); Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); } } VPlanTransforms::clearReductionWrapFlags(*Plan); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); O << ", "; getAddr()->printAsOperand(O, SlotTracker); VPValue *Mask = getMask(); if (Mask) { O << ", "; Mask->printAsOperand(O, SlotTracker); } unsigned OpIdx = 0; for (unsigned i = 0; i < IG->getFactor(); ++i) { if (!IG->getMember(i)) continue; if (getNumStoreOperands() > 0) { O << "\n" << Indent << " store "; getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); O << " to index " << i; } else { O << "\n" << Indent << " "; getVPValue(OpIdx)->printAsOperand(O, SlotTracker); O << " = load from index " << i; } ++OpIdx; } } #endif void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); Value *Start = getStartValue()->getLiveInIRValue(); const InductionDescriptor &ID = getInductionDescriptor(); TruncInst *Trunc = getTruncInst(); IRBuilderBase &Builder = State.Builder; assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); assert(State.VF.isVector() && "must have vector VF"); // The value from the original loop to which we are mapping the new induction // variable. Instruction *EntryVal = Trunc ? cast(Trunc) : IV; // Fast-math-flags propagate from the original induction instruction. IRBuilder<>::FastMathFlagGuard FMFG(Builder); if (ID.getInductionBinOp() && isa(ID.getInductionBinOp())) Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); // Now do the actual transformations, and start with fetching the step value. Value *Step = State.get(getStepValue(), VPIteration(0, 0)); assert((isa(EntryVal) || isa(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"); // Construct the initial value of the vector IV in the vector loop preheader auto CurrIP = Builder.saveIP(); BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); Builder.SetInsertPoint(VectorPH->getTerminator()); if (isa(EntryVal)) { assert(Start->getType()->isIntegerTy() && "Truncation requires an integer type"); auto *TruncType = cast(EntryVal->getType()); Step = Builder.CreateTrunc(Step, TruncType); Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); } Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); Value *SteppedStart = getStepVector( SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); // We create vector phi nodes for both integer and floating-point induction // variables. Here, we determine the kind of arithmetic we will perform. 
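  // For example, an integer IV uses (Add, Mul) to advance and scale the step,
  // while a floating-point IV uses its induction opcode (fadd/fsub) together
  // with FMul, as selected below.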
Instruction::BinaryOps AddOp; Instruction::BinaryOps MulOp; if (Step->getType()->isIntegerTy()) { AddOp = Instruction::Add; MulOp = Instruction::Mul; } else { AddOp = ID.getInductionOpcode(); MulOp = Instruction::FMul; } // Multiply the vectorization factor by the step using integer or // floating-point arithmetic as appropriate. Type *StepType = Step->getType(); Value *RuntimeVF; if (Step->getType()->isFloatingPointTy()) RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); else RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); // Create a vector splat to use in the induction update. // // FIXME: If the step is non-constant, we create the vector splat with // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. Value *SplatVF = isa(Mul) ? ConstantVector::getSplat(State.VF, cast(Mul)) : Builder.CreateVectorSplat(State.VF, Mul); Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll // factor. The last of those goes into the PHI. PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", &*State.CFG.PrevBB->getFirstInsertionPt()); VecInd->setDebugLoc(EntryVal->getDebugLoc()); Instruction *LastInduction = VecInd; for (unsigned Part = 0; Part < State.UF; ++Part) { State.set(this, LastInduction, Part); if (isa(EntryVal)) State.addMetadata(LastInduction, EntryVal); LastInduction = cast( Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); LastInduction->setDebugLoc(EntryVal->getDebugLoc()); } LastInduction->setName("vec.ind.next"); VecInd->addIncoming(SteppedStart, VectorPH); // Add induction update using an incorrect block temporarily. The phi node // will be fixed after VPlan execution. Note that at this point the latch // block cannot be used, as it does not exist yet. // TODO: Model increment value in VPlan, by turning the recipe into a // multi-def and a subclass of VPHeaderPHIRecipe. VecInd->addIncoming(LastInduction, VectorPH); } void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && "Not a pointer induction according to InductionDescriptor!"); assert(cast(getUnderlyingInstr())->getType()->isPointerTy() && "Unexpected type."); auto *IVR = getParent()->getPlan()->getCanonicalIV(); PHINode *CanonicalIV = cast(State.get(IVR, 0)); if (onlyScalarsGenerated(State.VF)) { // This is the normalized GEP that starts counting at zero. Value *PtrInd = State.Builder.CreateSExtOrTrunc( CanonicalIV, IndDesc.getStep()->getType()); // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. bool IsUniform = vputils::onlyFirstLaneUsed(this); assert((IsUniform || !State.VF.isScalable()) && "Cannot scalarize a scalable VF"); unsigned Lanes = IsUniform ? 
1 : State.VF.getFixedValue(); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *PartStart = createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); for (unsigned Lane = 0; Lane < Lanes; ++Lane) { Value *Idx = State.Builder.CreateAdd( PartStart, ConstantInt::get(PtrInd->getType(), Lane)); Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); Value *Step = State.get(getOperand(1), VPIteration(Part, Lane)); Value *SclrGep = emitTransformedIndex( State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); SclrGep->setName("next.gep"); State.set(this, SclrGep, VPIteration(Part, Lane)); } } return; } Type *PhiType = IndDesc.getStep()->getType(); // Build a pointer phi Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); Type *ScStValueType = ScalarStartValue->getType(); PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); // A pointer induction, performed by using a gep Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); Value *NumUnrolledElems = State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); Value *InductionGEP = GetElementPtrInst::Create( State.Builder.getInt8Ty(), NewPointerPhi, State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", InductionLoc); // Add induction update using an incorrect block temporarily. The phi node // will be fixed after VPlan execution. Note that at this point the latch // block cannot be used, as it does not exist yet. // TODO: Model increment value in VPlan, by turning the recipe into a // multi-def and a subclass of VPHeaderPHIRecipe. NewPointerPhi->addIncoming(InductionGEP, VectorPH); // Create UF many actual address geps that use the pointer // phi as base and a vectorized version of the step value // () as offset. for (unsigned Part = 0; Part < State.UF; ++Part) { Type *VecPhiType = VectorType::get(PhiType, State.VF); Value *StartOffsetScalar = State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); Value *StartOffset = State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); // Create a vector of consecutive numbers from zero to VF. StartOffset = State.Builder.CreateAdd( StartOffset, State.Builder.CreateStepVector(VecPhiType)); assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) && "scalar step must be the same across all parts"); Value *GEP = State.Builder.CreateGEP( State.Builder.getInt8Ty(), NewPointerPhi, State.Builder.CreateMul( StartOffset, State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), "vector.gep")); State.set(this, GEP, Part); } } void VPDerivedIVRecipe::execute(VPTransformState &State) { assert(!State.Instance && "VPDerivedIVRecipe being replicated."); // Fast-math-flags propagate from the original induction instruction. 
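  // Conceptually the value produced below is 'Start + CanonicalIV * Step'
  // (computed via emitTransformedIndex), truncated to ResultTy when a narrower
  // result type was requested.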
IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); if (IndDesc.getInductionBinOp() && isa(IndDesc.getInductionBinOp())) State.Builder.setFastMathFlags( IndDesc.getInductionBinOp()->getFastMathFlags()); Value *Step = State.get(getStepValue(), VPIteration(0, 0)); Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0)); Value *DerivedIV = emitTransformedIndex(State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step, IndDesc); DerivedIV->setName("offset.idx"); if (ResultTy != DerivedIV->getType()) { assert(Step->getType()->isIntegerTy() && "Truncation requires an integer step"); DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy); } assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); State.set(this, DerivedIV, VPIteration(0, 0)); } void VPScalarIVStepsRecipe::execute(VPTransformState &State) { // Fast-math-flags propagate from the original induction instruction. IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); if (IndDesc.getInductionBinOp() && isa(IndDesc.getInductionBinOp())) State.Builder.setFastMathFlags( IndDesc.getInductionBinOp()->getFastMathFlags()); Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); Value *Step = State.get(getStepValue(), VPIteration(0, 0)); buildScalarSteps(BaseIV, Step, IndDesc, this, State); } void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), getStoredValues(), getMask(), NeedsMaskForGaps); } void VPReductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Reduction being replicated."); Value *PrevInChain = State.get(getChainOp(), 0); RecurKind Kind = RdxDesc->getRecurrenceKind(); bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); // Propagate the fast-math flags carried by the underlying instruction. IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *NewVecOp = State.get(getVecOp(), Part); if (VPValue *Cond = getCondOp()) { Value *NewCond = State.get(Cond, Part); VectorType *VecTy = cast(NewVecOp->getType()); Value *Iden = RdxDesc->getRecurrenceIdentity( Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); Value *IdenVec = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); NewVecOp = Select; } Value *NewRed; Value *NextInChain; if (IsOrdered) { if (State.VF.isVector()) NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, PrevInChain); else NewRed = State.Builder.CreateBinOp( (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, NewVecOp); PrevInChain = NewRed; } else { PrevInChain = State.get(getChainOp(), Part); NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); } if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { NextInChain = createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), NewRed, PrevInChain); } else if (IsOrdered) NextInChain = NewRed; else NextInChain = State.Builder.CreateBinOp( (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, PrevInChain); State.set(this, NextInChain, Part); } } void VPReplicateRecipe::execute(VPTransformState &State) { Instruction *UI = getUnderlyingInstr(); if (State.Instance) { // Generate a single instance. 
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); State.ILV->scalarizeInstruction(UI, this, *State.Instance, State); // Insert scalar instance packing it into a vector. if (State.VF.isVector() && shouldPack()) { // If we're constructing lane 0, initialize to start from poison. if (State.Instance->Lane.isFirstLane()) { assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); Value *Poison = PoisonValue::get( VectorType::get(UI->getType(), State.VF)); State.set(this, Poison, State.Instance->Part); } State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); } return; } if (IsUniform) { // If the recipe is uniform across all parts (instead of just per VF), only // generate a single instance. if ((isa(UI) || isa(UI)) && all_of(operands(), [](VPValue *Op) { return Op->isDefinedOutsideVectorRegions(); })) { State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State); if (user_begin() != user_end()) { for (unsigned Part = 1; Part < State.UF; ++Part) State.set(this, State.get(this, VPIteration(0, 0)), VPIteration(Part, 0)); } return; } // Uniform within VL means we need to generate lane 0 only for each // unrolled copy. for (unsigned Part = 0; Part < State.UF; ++Part) State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State); return; } // A store of a loop varying value to a uniform address only needs the last // copy of the store. if (isa(UI) && vputils::isUniformAfterVectorization(getOperand(1))) { auto Lane = VPLane::getLastLaneForVF(State.VF); State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), State); return; } // Generate scalar instances for all VF lanes of all UF parts. assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); const unsigned EndLane = State.VF.getKnownMinValue(); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; // Attempt to issue a wide load. LoadInst *LI = dyn_cast(&Ingredient); StoreInst *SI = dyn_cast(&Ingredient); assert((LI || SI) && "Invalid Load/Store instruction"); assert((!SI || StoredValue) && "No stored value provided for widened store"); assert((!LI || !StoredValue) && "Stored value provided for widened load"); Type *ScalarDataTy = getLoadStoreType(&Ingredient); auto *DataTy = VectorType::get(ScalarDataTy, State.VF); const Align Alignment = getLoadStoreAlignment(&Ingredient); bool CreateGatherScatter = !isConsecutive(); auto &Builder = State.Builder; InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); bool isMaskRequired = getMask(); if (isMaskRequired) for (unsigned Part = 0; Part < State.UF; ++Part) BlockInMaskParts[Part] = State.get(getMask(), Part); const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { // Calculate the pointer for the specific unroll-part. Value *PartPtr = nullptr; // Use i32 for the gep index type when the value is constant, // or query DataLayout for a more suitable index type otherwise. const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout(); Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0) ? 
        DL.getIndexType(ScalarDataTy->getPointerTo()) : Builder.getInt32Ty();
    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
      InBounds = gep->isInBounds();
    if (isReverse()) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt =
          Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part),
                            RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane =
          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
      PartPtr =
          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        BlockInMaskParts[Part] =
            Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
    } else {
      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
    }

    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    State.setDebugLocFromInst(SI);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = State.get(StoredValue, Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
        Value *VectorGep = State.get(getAddr(), Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (isReverse()) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
          // We don't want to update the value in the map as it might be used
          // in another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr =
            CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            BlockInMaskParts[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      State.addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  State.setDebugLocFromInst(LI);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
      Value *VectorGep = State.get(getAddr(), Part);
      NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      State.addMetadata(NewLI, LI);
    } else {
      auto *VecPtr =
          CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(
            DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
            PoisonValue::get(DataTy), "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      State.addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
    }

    State.set(getVPSingleValue(), NewLI, Part);
  }
}

// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
  // 1) OptSize takes precedence over all other options, i.e. if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
  // LoopAccessInfo (due to code dependency and not being able to reliably get
  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
  // versioning when the vectorization is forced, unlike hasOptSize. So revert
  // back to the old way and vectorize with versioning when forced. See D81345.)
  if (F->hasOptSize() ||
      (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                   PGSOQueryType::IRPass) &&
       Hints.getForce() != LoopVectorizeHints::FK_Enabled))
    return CM_ScalarEpilogueNotAllowedOptSize;

  // 2) If set, obey the directives.
  if (PreferPredicateOverEpilogue.getNumOccurrences()) {
    switch (PreferPredicateOverEpilogue) {
    case PreferPredicateTy::ScalarEpilogue:
      return CM_ScalarEpilogueAllowed;
    case PreferPredicateTy::PredicateElseScalarEpilogue:
      return CM_ScalarEpilogueNotNeededUsePredicate;
    case PreferPredicateTy::PredicateOrDontVectorize:
      return CM_ScalarEpilogueNotAllowedUsePredicate;
    };
  }

  // 3) If set, obey the hints.
  switch (Hints.getPredicate()) {
  case LoopVectorizeHints::FK_Enabled:
    return CM_ScalarEpilogueNotNeededUsePredicate;
  case LoopVectorizeHints::FK_Disabled:
    return CM_ScalarEpilogueAllowed;
  };

  // 4) If the TTI hook indicates this is profitable, request predication.
  TailFoldingInfo TFI(TLI, &LVL, IAI);
  if (TTI->preferPredicateOverEpilogue(&TFI))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If Values have been set for this Def return the one relevant for \p Part.
  if (hasVectorValue(Def, Part))
    return Data.PerPartOutput[Def][Part];

  auto GetBroadcastInstrs = [this, Def](Value *V) {
    bool SafeToHoist = Def->isDefinedOutsideVectorRegions();
    if (VF.isScalar())
      return V;
    // Place the code for broadcasting invariant variables in the new
    // preheader.
    IRBuilder<>::InsertPointGuard Guard(Builder);
    if (SafeToHoist) {
      BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>(
          Plan->getVectorLoopRegion()->getSinglePredecessor())];
      if (LoopVectorPreHeader)
        Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
    }

    // Broadcast the scalar into all locations in the vector.
    Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

    return Shuf;
  };

  if (!hasScalarValue(Def, {Part, 0})) {
    Value *IRV = Def->getLiveInIRValue();
    Value *B = GetBroadcastInstrs(IRV);
    set(Def, B, Part);
    return B;
  }

  Value *ScalarValue = get(Def, {Part, 0});
  // If we aren't vectorizing, we can just copy the scalar map values over
  // to the vector map.
  if (VF.isScalar()) {
    set(Def, ScalarValue, Part);
    return ScalarValue;
  }

  bool IsUniform = vputils::isUniformAfterVectorization(Def);

  unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
  // Check if there is a scalar value for the selected lane.
  if (!hasScalarValue(Def, {Part, LastLane})) {
    // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
    // VPExpandSCEVRecipes can also be uniform.
    assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
            isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) ||
            isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
           "unexpected recipe found to be invariant");
    IsUniform = true;
    LastLane = 0;
  }

  auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
  // Set the insert point after the last scalarized instruction or after the
  // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
  // will directly follow the scalar definitions.
  auto OldIP = Builder.saveIP();
  auto NewIP =
      isa<PHINode>(LastInst)
          ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
          : std::next(BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);

  // However, if we are vectorizing, we need to construct the vector values.
  // If the value is known to be uniform after vectorization, we can just
  // broadcast the scalar value corresponding to lane zero for each unroll
  // iteration. Otherwise, we construct the vector values using
  // insertelement instructions. Since the resulting vectors are stored in
  // State, we will only generate the insertelements once.
  Value *VectorValue = nullptr;
  if (IsUniform) {
    VectorValue = GetBroadcastInstrs(ScalarValue);
    set(Def, VectorValue, Part);
  } else {
    // Initialize packing with insertelements to start from poison.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Undef, Part);
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
    VectorValue = get(Def, Part);
  }
  Builder.restoreIP(OldIP);
  return VectorValue;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL =
      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, LVL, CM, IAI, PSE, Hints, ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  CM.collectElementTypesForWidening();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
    return false;

  VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);

  {
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                             F->getParent()->getDataLayout());
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                           VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  SmallVector<Instruction *, 8> Worklist;

  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
  SmallPtrSet<const Instruction *, 4> Visited;
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
  while (!Worklist.empty()) {
    auto *I = Worklist.pop_back_val();
    if (!L->contains(I))
      continue;
    if (!Visited.insert(I).second)
      continue;

    // Emit a remark if the floating point store required a floating
    // point conversion.
    // TODO: More work could be done to identify the root cause such as a
    // constant or a function return type and point the user to it.
    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
                                          I->getDebugLoc(), L->getHeader())
               << "floating point conversion changes vector width. "
               << "Mixed floating point precision requires an up/down "
               << "cast that will negatively impact performance.";
      });

    for (Use &Op : I->operands())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push_back(OpI);
  }
}

static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
                                       VectorizationFactor &VF,
                                       std::optional<unsigned> VScale, Loop *L,
                                       ScalarEvolution &SE) {
  InstructionCost CheckCost = Checks.getCost();
  if (!CheckCost.isValid())
    return false;

  // When interleaving only, scalar and vector cost will be equal, which in
  // turn would lead to a divide by 0. Fall back to hard threshold.
  if (VF.Width.isScalar()) {
    if (CheckCost > VectorizeMemoryCheckThreshold) {
      LLVM_DEBUG(
          dbgs()
          << "LV: Interleaving only is not profitable due to runtime checks\n");
      return false;
    }
    return true;
  }

  // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
  double ScalarC = *VF.ScalarCost.getValue();
  if (ScalarC == 0)
    return true;

  // First, compute the minimum iteration count required so that the vector
  // loop outperforms the scalar loop.
  //  The total cost of the scalar loop is
  //   ScalarC * TC
  //  where
  //  * TC is the actual trip count of the loop
  //  * ScalarC is the cost of a single scalar iteration.
  //
  //  The total cost of the vector loop is
  //   RtC + VecC * (TC / VF) + EpiC
  //  where
  //  * RtC is the cost of the generated runtime checks
  //  * VecC is the cost of a single vector iteration
  //  * TC is the actual trip count of the loop
  //  * VF is the vectorization factor
  //  * EpiCost is the cost of the generated epilogue, including the cost
  //    of the remaining scalar operations.
  //
  // Vectorization is profitable once the total vector cost is less than the
  // total scalar cost:
  //   RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
  //
  // Now we can compute the minimum required trip count TC as
  //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
  //
  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
  // the computations are performed on doubles, not integers and the result
  // is rounded up, hence we get an upper estimate of the TC.
  unsigned IntVF = VF.Width.getKnownMinValue();
  if (VF.Width.isScalable()) {
    unsigned AssumedMinimumVscale = 1;
    if (VScale)
      AssumedMinimumVscale = *VScale;
    IntVF *= AssumedMinimumVscale;
  }
  double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
  double RtC = *CheckCost.getValue();
  double MinTC1 = RtC / (ScalarC - VecCOverVF);

  // Second, compute a minimum iteration count so that the cost of the
  // runtime checks is only a fraction of the total scalar loop cost. This
  // adds a loop-dependent bound on the overhead incurred if the runtime
  // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
  // * TC. To bound the runtime check to be a fraction 1/X of the scalar
  // cost, compute
  //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
  double MinTC2 = RtC * 10 / ScalarC;

  // Now pick the larger minimum. If it is not a multiple of VF, choose the
  // next closest multiple of VF. This should partly compensate for ignoring
  // the epilogue cost.
  uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
  VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));

  LLVM_DEBUG(
      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
             << VF.MinProfitableTripCount << "\n");

  // Skip vectorization if the expected trip count is less than the minimum
  // required trip count.
  if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
    if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
                                VF.MinProfitableTripCount)) {
      LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                           "trip count < minimum profitable VF ("
                        << *ExpectedTC << " < " << VF.MinProfitableTripCount
                        << ")\n");
      return false;
    }
  }
  return true;
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
                    << L->getHeader()->getParent()->getName() << "' from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled ?
"enabled" : "?")) << " width=" << Hints.getWidth() << " interleave=" << Hints.getInterleave() << "\n"); // Function containing loop Function *F = L->getHeader()->getParent(); // Looking at the diagnostic output is the only way to determine if a loop // was vectorized (other than looking at the IR or machine code), so it // is important to generate an optimization remark for each loop. Most of // these messages are generated as OptimizationRemarkAnalysis. Remarks // generated as OptimizationRemark and OptimizationRemarkMissed are // less verbose reporting vectorized loops and unvectorized loops that may // benefit from vectorization, respectively. if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); return false; } PredicatedScalarEvolution PSE(*SE, *L); // Check if it is legal to vectorize the loop. LoopVectorizationRequirements Requirements; LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, &Requirements, &Hints, DB, AC, BFI, PSI); if (!LVL.canVectorize(EnableVPlanNativePath)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); Hints.emitRemarkWithHints(); return false; } // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before // even evaluating whether vectorization is profitable. Since we cannot modify // the incoming IR, we need to build VPlan upfront in the vectorization // pipeline. if (!L->isInnermost()) return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, ORE, BFI, PSI, Hints, Requirements); assert(L->isInnermost() && "Inner loop expected."); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); // If an override option has been passed in for interleaved accesses, use it. if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) UseInterleaved = EnableInterleavedMemAccesses; // Analyze interleaved memory accesses. if (UseInterleaved) IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); // Check the function attributes and profiles to find out if this function // should be optimized for size. ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI); // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. auto ExpectedTC = getSmallBestKnownTC(*SE, L); if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is worth vectorizing only if no scalar " << "iteration overheads are incurred."); if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) { LLVM_DEBUG(dbgs() << "\n"); SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; } else { LLVM_DEBUG(dbgs() << " But the target considers the trip count too " "small to consider vectorizing.\n"); reportVectorizationFailure( "The trip count is below the minial threshold value.", "loop trip count is too low, avoiding vectorization", "LowTripCount", ORE, L); Hints.emitRemarkWithHints(); return false; } } } // Check the function attributes to see if implicit floats or vectors are // allowed. 
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.", "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool AllowOrderedReductions;
  // If the flag is set, use that instead and override the TTI behaviour.
  if (ForceOrderedReductions.getNumOccurrences() > 0)
    AllowOrderedReductions = ForceOrderedReductions;
  else
    AllowOrderedReductions = TTI->enableOrderedReductions();
  if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
                               ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                           F->getParent()->getDataLayout());
  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);

    unsigned SelectedIC = std::max(IC, UserIC);
    // Optimistically generate runtime checks if they are needed. Drop them if
    // they turn out to not be profitable.
    if (VF.Width.isVector() || SelectedIC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);

    // Check if it is profitable to vectorize with runtime checks.
    bool ForceVectorization =
        Hints.getForce() == LoopVectorizeHints::FK_Enabled;
    if (!ForceVectorization &&
        !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
                                    *PSE.getSE())) {
      ORE->emit([&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
                   L->getHeader())
               << "loop not vectorized: cannot prove it is safe to reorder "
                  "memory operations";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      Hints.emitRemarkWithHints();
      return false;
    }
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);

      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
        auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
                                             BestMainPlan, MainILV, DT, true);
        ++LoopsVectorized;

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);

        VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
        VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
        VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
        Header->setName("vec.epilog.vector.body");

        // Re-use the trip count and steps expanded for the main loop, as
        // skeleton creation needs it as a value that dominates both the scalar
        // and vector epilogue loops.
        // TODO: This is a workaround needed for epilogue vectorization and it
        // should be removed once induction resume value creation is done
        // directly in VPlan.
        EpilogILV.setTripCount(MainILV.getTripCount());
        for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
          auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
          auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
              ExpandedSCEVs.find(ExpandR->getSCEV())->second);
          ExpandR->replaceAllUsesWith(ExpandedVal);
          ExpandR->eraseFromParent();
        }

        // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
        // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
        // before vectorizing the epilogue loop.
        for (VPRecipeBase &R : Header->phis()) {
          if (isa<VPCanonicalIVPHIRecipe>(&R))
            continue;

          Value *ResumeV = nullptr;
          // TODO: Move setting of resume values to prepareToExecute.
          if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
            ResumeV = MainILV.getReductionResumeValue(
                ReductionPhi->getRecurrenceDescriptor());
          } else {
            // Create induction resume values for both widened pointer and
            // integer/fp inductions and update the start value of the
            // induction recipes to use the resume value.
            PHINode *IndPhi = nullptr;
            const InductionDescriptor *ID;
            if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
              IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
              ID = &Ind->getInductionDescriptor();
            } else {
              auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
              IndPhi = WidenInd->getPHINode();
              ID = &WidenInd->getInductionDescriptor();
            }

            ResumeV = MainILV.createInductionResumeValue(
                IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
                {EPI.MainLoopIterationCountCheck});
          }
          assert(ResumeV && "Must have a resume value");
          VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
          cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
        }

        LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                        DT, true, &ExpandedSCEVs);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                               VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                               PSI, Checks);

        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that is
        // rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  std::optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID) {
    L->setLoopID(*RemainderLoopID);
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = BFI_;
  TLI = TLI_;
  AC = &AC_;
  LAIs = &LAIs_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (const auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize.
  // This is necessary as the act of vectorizing or partially unrolling a loop
  // creates new loops and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);

    if (Changed)
      LAIs->clear();
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  // There are no loops in the function. Return before computing other
  // expensive analyses.
  if (LI.empty())
    return PreservedAnalyses::all();
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);

  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  BlockFrequencyInfo *BFI = nullptr;
  if (PSI && PSI->hasProfileSummary())
    BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  if (isAssignmentTrackingEnabled(*F.getParent())) {
    for (auto &BB : F)
      RemoveRedundantDbgInstrs(&BB);
  }

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
    PA.preserve<ScalarEvolutionAnalysis>();

#ifdef EXPENSIVE_CHECKS
    SE.verify();
#endif
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << '<';
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << '>';
}