Index: projects/clang390-import/contrib/llvm/include/llvm/Transforms/Scalar/Reassociate.h =================================================================== --- projects/clang390-import/contrib/llvm/include/llvm/Transforms/Scalar/Reassociate.h (revision 304769) +++ projects/clang390-import/contrib/llvm/include/llvm/Transforms/Scalar/Reassociate.h (revision 304770) @@ -1,100 +1,100 @@ //===- Reassociate.h - Reassociate binary expressions -----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This pass reassociates commutative expressions in an order that is designed // to promote better constant propagation, GCSE, LICM, PRE, etc. // // For example: 4 + (x + 5) -> x + (4 + 5) // // In the implementation of this algorithm, constants are assigned rank = 0, // function arguments are rank = 1, and other values are assigned ranks // corresponding to the reverse post order traversal of current function // (starting at 2), which effectively gives values in deep loops higher rank // than values not in loops. // //===----------------------------------------------------------------------===// #ifndef LLVM_TRANSFORMS_SCALAR_REASSOCIATE_H #define LLVM_TRANSFORMS_SCALAR_REASSOCIATE_H #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" namespace llvm { /// A private "module" namespace for types and utilities used by Reassociate. /// These are implementation details and should not be used by clients. namespace reassociate { struct ValueEntry { unsigned Rank; Value *Op; ValueEntry(unsigned R, Value *O) : Rank(R), Op(O) {} }; inline bool operator<(const ValueEntry &LHS, const ValueEntry &RHS) { return LHS.Rank > RHS.Rank; // Sort so that highest rank goes to start. } /// \brief Utility class representing a base and exponent pair which form one /// factor of some product. struct Factor { Value *Base; unsigned Power; Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {} }; class XorOpnd; } /// Reassociate commutative expressions. class ReassociatePass : public PassInfoMixin { DenseMap RankMap; DenseMap, unsigned> ValueRankMap; SetVector> RedoInsts; bool MadeChange; public: PreservedAnalyses run(Function &F, FunctionAnalysisManager &); private: - void BuildRankMap(Function &F, ReversePostOrderTraversal &RPOT); + void BuildRankMap(Function &F); unsigned getRank(Value *V); void canonicalizeOperands(Instruction *I); void ReassociateExpression(BinaryOperator *I); void RewriteExprTree(BinaryOperator *I, SmallVectorImpl &Ops); Value *OptimizeExpression(BinaryOperator *I, SmallVectorImpl &Ops); Value *OptimizeAdd(Instruction *I, SmallVectorImpl &Ops); Value *OptimizeXor(Instruction *I, SmallVectorImpl &Ops); bool CombineXorOpnd(Instruction *I, reassociate::XorOpnd *Opnd1, APInt &ConstOpnd, Value *&Res); bool CombineXorOpnd(Instruction *I, reassociate::XorOpnd *Opnd1, reassociate::XorOpnd *Opnd2, APInt &ConstOpnd, Value *&Res); bool collectMultiplyFactors(SmallVectorImpl &Ops, SmallVectorImpl &Factors); Value *buildMinimalMultiplyDAG(IRBuilder<> &Builder, SmallVectorImpl &Factors); Value *OptimizeMul(BinaryOperator *I, SmallVectorImpl &Ops); Value *RemoveFactorFromExpression(Value *V, Value *Factor); void EraseInst(Instruction *I); void RecursivelyEraseDeadInsts(Instruction *I, SetVector> &Insts); void OptimizeInst(Instruction *I); Instruction *canonicalizeNegConstExpr(Instruction *I); }; } #endif // LLVM_TRANSFORMS_SCALAR_REASSOCIATE_H Index: projects/clang390-import/contrib/llvm/lib/Analysis/ScalarEvolution.cpp =================================================================== --- projects/clang390-import/contrib/llvm/lib/Analysis/ScalarEvolution.cpp (revision 304769) +++ projects/clang390-import/contrib/llvm/lib/Analysis/ScalarEvolution.cpp (revision 304770) @@ -1,10463 +1,10467 @@ //===- ScalarEvolution.cpp - Scalar Evolution Analysis --------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file contains the implementation of the scalar evolution analysis // engine, which is used primarily to analyze expressions involving induction // variables in loops. // // There are several aspects to this library. First is the representation of // scalar expressions, which are represented as subclasses of the SCEV class. // These classes are used to represent certain types of subexpressions that we // can handle. We only create one SCEV of a particular shape, so // pointer-comparisons for equality are legal. // // One important aspect of the SCEV objects is that they are never cyclic, even // if there is a cycle in the dataflow for an expression (ie, a PHI node). If // the PHI node is one of the idioms that we can represent (e.g., a polynomial // recurrence) then we represent it directly as a recurrence node, otherwise we // represent it as a SCEVUnknown node. // // In addition to being able to represent expressions of various types, we also // have folders that are used to build the *canonical* representation for a // particular expression. These folders are capable of using a variety of // rewrite rules to simplify the expressions. // // Once the folders are defined, we can implement the more interesting // higher-level code, such as the code that recognizes PHI nodes of various // types, computes the execution count of a loop, etc. // // TODO: We should use these routines and value representations to implement // dependence analysis! // //===----------------------------------------------------------------------===// // // There are several good references for the techniques used in this analysis. // // Chains of recurrences -- a method to expedite the evaluation // of closed-form functions // Olaf Bachmann, Paul S. Wang, Eugene V. Zima // // On computational properties of chains of recurrences // Eugene V. Zima // // Symbolic Evaluation of Chains of Recurrences for Loop Optimization // Robert A. van Engelen // // Efficient Symbolic Analysis for Optimizing Compilers // Robert A. van Engelen // // Using the chains of recurrences algebra for data dependence testing and // induction variable substitution // MS Thesis, Johnie Birch // //===----------------------------------------------------------------------===// #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/SaveAndRestore.h" #include using namespace llvm; #define DEBUG_TYPE "scalar-evolution" STATISTIC(NumArrayLenItCounts, "Number of trip counts computed with array length"); STATISTIC(NumTripCountsComputed, "Number of loops with predictable loop counts"); STATISTIC(NumTripCountsNotComputed, "Number of loops without predictable loop counts"); STATISTIC(NumBruteForceTripCountsComputed, "Number of loops with trip counts computed by force"); static cl::opt MaxBruteForceIterations("scalar-evolution-max-iterations", cl::ReallyHidden, cl::desc("Maximum number of iterations SCEV will " "symbolically execute a constant " "derived loop"), cl::init(100)); // FIXME: Enable this with EXPENSIVE_CHECKS when the test suite is clean. static cl::opt VerifySCEV("verify-scev", cl::desc("Verify ScalarEvolution's backedge taken counts (slow)")); static cl::opt VerifySCEVMap("verify-scev-maps", cl::desc("Verify no dangling value in ScalarEvolution's " "ExprValueMap (slow)")); //===----------------------------------------------------------------------===// // SCEV class definitions //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // Implementation of the SCEV class. // LLVM_DUMP_METHOD void SCEV::dump() const { print(dbgs()); dbgs() << '\n'; } void SCEV::print(raw_ostream &OS) const { switch (static_cast(getSCEVType())) { case scConstant: cast(this)->getValue()->printAsOperand(OS, false); return; case scTruncate: { const SCEVTruncateExpr *Trunc = cast(this); const SCEV *Op = Trunc->getOperand(); OS << "(trunc " << *Op->getType() << " " << *Op << " to " << *Trunc->getType() << ")"; return; } case scZeroExtend: { const SCEVZeroExtendExpr *ZExt = cast(this); const SCEV *Op = ZExt->getOperand(); OS << "(zext " << *Op->getType() << " " << *Op << " to " << *ZExt->getType() << ")"; return; } case scSignExtend: { const SCEVSignExtendExpr *SExt = cast(this); const SCEV *Op = SExt->getOperand(); OS << "(sext " << *Op->getType() << " " << *Op << " to " << *SExt->getType() << ")"; return; } case scAddRecExpr: { const SCEVAddRecExpr *AR = cast(this); OS << "{" << *AR->getOperand(0); for (unsigned i = 1, e = AR->getNumOperands(); i != e; ++i) OS << ",+," << *AR->getOperand(i); OS << "}<"; if (AR->hasNoUnsignedWrap()) OS << "nuw><"; if (AR->hasNoSignedWrap()) OS << "nsw><"; if (AR->hasNoSelfWrap() && !AR->getNoWrapFlags((NoWrapFlags)(FlagNUW | FlagNSW))) OS << "nw><"; AR->getLoop()->getHeader()->printAsOperand(OS, /*PrintType=*/false); OS << ">"; return; } case scAddExpr: case scMulExpr: case scUMaxExpr: case scSMaxExpr: { const SCEVNAryExpr *NAry = cast(this); const char *OpStr = nullptr; switch (NAry->getSCEVType()) { case scAddExpr: OpStr = " + "; break; case scMulExpr: OpStr = " * "; break; case scUMaxExpr: OpStr = " umax "; break; case scSMaxExpr: OpStr = " smax "; break; } OS << "("; for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end(); I != E; ++I) { OS << **I; if (std::next(I) != E) OS << OpStr; } OS << ")"; switch (NAry->getSCEVType()) { case scAddExpr: case scMulExpr: if (NAry->hasNoUnsignedWrap()) OS << ""; if (NAry->hasNoSignedWrap()) OS << ""; } return; } case scUDivExpr: { const SCEVUDivExpr *UDiv = cast(this); OS << "(" << *UDiv->getLHS() << " /u " << *UDiv->getRHS() << ")"; return; } case scUnknown: { const SCEVUnknown *U = cast(this); Type *AllocTy; if (U->isSizeOf(AllocTy)) { OS << "sizeof(" << *AllocTy << ")"; return; } if (U->isAlignOf(AllocTy)) { OS << "alignof(" << *AllocTy << ")"; return; } Type *CTy; Constant *FieldNo; if (U->isOffsetOf(CTy, FieldNo)) { OS << "offsetof(" << *CTy << ", "; FieldNo->printAsOperand(OS, false); OS << ")"; return; } // Otherwise just print it normally. U->getValue()->printAsOperand(OS, false); return; } case scCouldNotCompute: OS << "***COULDNOTCOMPUTE***"; return; } llvm_unreachable("Unknown SCEV kind!"); } Type *SCEV::getType() const { switch (static_cast(getSCEVType())) { case scConstant: return cast(this)->getType(); case scTruncate: case scZeroExtend: case scSignExtend: return cast(this)->getType(); case scAddRecExpr: case scMulExpr: case scUMaxExpr: case scSMaxExpr: return cast(this)->getType(); case scAddExpr: return cast(this)->getType(); case scUDivExpr: return cast(this)->getType(); case scUnknown: return cast(this)->getType(); case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); } llvm_unreachable("Unknown SCEV kind!"); } bool SCEV::isZero() const { if (const SCEVConstant *SC = dyn_cast(this)) return SC->getValue()->isZero(); return false; } bool SCEV::isOne() const { if (const SCEVConstant *SC = dyn_cast(this)) return SC->getValue()->isOne(); return false; } bool SCEV::isAllOnesValue() const { if (const SCEVConstant *SC = dyn_cast(this)) return SC->getValue()->isAllOnesValue(); return false; } bool SCEV::isNonConstantNegative() const { const SCEVMulExpr *Mul = dyn_cast(this); if (!Mul) return false; // If there is a constant factor, it will be first. const SCEVConstant *SC = dyn_cast(Mul->getOperand(0)); if (!SC) return false; // Return true if the value is negative, this matches things like (-42 * V). return SC->getAPInt().isNegative(); } SCEVCouldNotCompute::SCEVCouldNotCompute() : SCEV(FoldingSetNodeIDRef(), scCouldNotCompute) {} bool SCEVCouldNotCompute::classof(const SCEV *S) { return S->getSCEVType() == scCouldNotCompute; } const SCEV *ScalarEvolution::getConstant(ConstantInt *V) { FoldingSetNodeID ID; ID.AddInteger(scConstant); ID.AddPointer(V); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; SCEV *S = new (SCEVAllocator) SCEVConstant(ID.Intern(SCEVAllocator), V); UniqueSCEVs.InsertNode(S, IP); return S; } const SCEV *ScalarEvolution::getConstant(const APInt &Val) { return getConstant(ConstantInt::get(getContext(), Val)); } const SCEV * ScalarEvolution::getConstant(Type *Ty, uint64_t V, bool isSigned) { IntegerType *ITy = cast(getEffectiveSCEVType(Ty)); return getConstant(ConstantInt::get(ITy, V, isSigned)); } SCEVCastExpr::SCEVCastExpr(const FoldingSetNodeIDRef ID, unsigned SCEVTy, const SCEV *op, Type *ty) : SCEV(ID, SCEVTy), Op(op), Ty(ty) {} SCEVTruncateExpr::SCEVTruncateExpr(const FoldingSetNodeIDRef ID, const SCEV *op, Type *ty) : SCEVCastExpr(ID, scTruncate, op, ty) { assert((Op->getType()->isIntegerTy() || Op->getType()->isPointerTy()) && (Ty->isIntegerTy() || Ty->isPointerTy()) && "Cannot truncate non-integer value!"); } SCEVZeroExtendExpr::SCEVZeroExtendExpr(const FoldingSetNodeIDRef ID, const SCEV *op, Type *ty) : SCEVCastExpr(ID, scZeroExtend, op, ty) { assert((Op->getType()->isIntegerTy() || Op->getType()->isPointerTy()) && (Ty->isIntegerTy() || Ty->isPointerTy()) && "Cannot zero extend non-integer value!"); } SCEVSignExtendExpr::SCEVSignExtendExpr(const FoldingSetNodeIDRef ID, const SCEV *op, Type *ty) : SCEVCastExpr(ID, scSignExtend, op, ty) { assert((Op->getType()->isIntegerTy() || Op->getType()->isPointerTy()) && (Ty->isIntegerTy() || Ty->isPointerTy()) && "Cannot sign extend non-integer value!"); } void SCEVUnknown::deleted() { // Clear this SCEVUnknown from various maps. SE->forgetMemoizedResults(this); // Remove this SCEVUnknown from the uniquing map. SE->UniqueSCEVs.RemoveNode(this); // Release the value. setValPtr(nullptr); } void SCEVUnknown::allUsesReplacedWith(Value *New) { // Clear this SCEVUnknown from various maps. SE->forgetMemoizedResults(this); // Remove this SCEVUnknown from the uniquing map. SE->UniqueSCEVs.RemoveNode(this); // Update this SCEVUnknown to point to the new value. This is needed // because there may still be outstanding SCEVs which still point to // this SCEVUnknown. setValPtr(New); } bool SCEVUnknown::isSizeOf(Type *&AllocTy) const { if (ConstantExpr *VCE = dyn_cast(getValue())) if (VCE->getOpcode() == Instruction::PtrToInt) if (ConstantExpr *CE = dyn_cast(VCE->getOperand(0))) if (CE->getOpcode() == Instruction::GetElementPtr && CE->getOperand(0)->isNullValue() && CE->getNumOperands() == 2) if (ConstantInt *CI = dyn_cast(CE->getOperand(1))) if (CI->isOne()) { AllocTy = cast(CE->getOperand(0)->getType()) ->getElementType(); return true; } return false; } bool SCEVUnknown::isAlignOf(Type *&AllocTy) const { if (ConstantExpr *VCE = dyn_cast(getValue())) if (VCE->getOpcode() == Instruction::PtrToInt) if (ConstantExpr *CE = dyn_cast(VCE->getOperand(0))) if (CE->getOpcode() == Instruction::GetElementPtr && CE->getOperand(0)->isNullValue()) { Type *Ty = cast(CE->getOperand(0)->getType())->getElementType(); if (StructType *STy = dyn_cast(Ty)) if (!STy->isPacked() && CE->getNumOperands() == 3 && CE->getOperand(1)->isNullValue()) { if (ConstantInt *CI = dyn_cast(CE->getOperand(2))) if (CI->isOne() && STy->getNumElements() == 2 && STy->getElementType(0)->isIntegerTy(1)) { AllocTy = STy->getElementType(1); return true; } } } return false; } bool SCEVUnknown::isOffsetOf(Type *&CTy, Constant *&FieldNo) const { if (ConstantExpr *VCE = dyn_cast(getValue())) if (VCE->getOpcode() == Instruction::PtrToInt) if (ConstantExpr *CE = dyn_cast(VCE->getOperand(0))) if (CE->getOpcode() == Instruction::GetElementPtr && CE->getNumOperands() == 3 && CE->getOperand(0)->isNullValue() && CE->getOperand(1)->isNullValue()) { Type *Ty = cast(CE->getOperand(0)->getType())->getElementType(); // Ignore vector types here so that ScalarEvolutionExpander doesn't // emit getelementptrs that index into vectors. if (Ty->isStructTy() || Ty->isArrayTy()) { CTy = Ty; FieldNo = CE->getOperand(2); return true; } } return false; } //===----------------------------------------------------------------------===// // SCEV Utilities //===----------------------------------------------------------------------===// namespace { /// SCEVComplexityCompare - Return true if the complexity of the LHS is less /// than the complexity of the RHS. This comparator is used to canonicalize /// expressions. class SCEVComplexityCompare { const LoopInfo *const LI; public: explicit SCEVComplexityCompare(const LoopInfo *li) : LI(li) {} // Return true or false if LHS is less than, or at least RHS, respectively. bool operator()(const SCEV *LHS, const SCEV *RHS) const { return compare(LHS, RHS) < 0; } // Return negative, zero, or positive, if LHS is less than, equal to, or // greater than RHS, respectively. A three-way result allows recursive // comparisons to be more efficient. int compare(const SCEV *LHS, const SCEV *RHS) const { // Fast-path: SCEVs are uniqued so we can do a quick equality check. if (LHS == RHS) return 0; // Primarily, sort the SCEVs by their getSCEVType(). unsigned LType = LHS->getSCEVType(), RType = RHS->getSCEVType(); if (LType != RType) return (int)LType - (int)RType; // Aside from the getSCEVType() ordering, the particular ordering // isn't very important except that it's beneficial to be consistent, // so that (a + b) and (b + a) don't end up as different expressions. switch (static_cast(LType)) { case scUnknown: { const SCEVUnknown *LU = cast(LHS); const SCEVUnknown *RU = cast(RHS); // Sort SCEVUnknown values with some loose heuristics. TODO: This is // not as complete as it could be. const Value *LV = LU->getValue(), *RV = RU->getValue(); // Order pointer values after integer values. This helps SCEVExpander // form GEPs. bool LIsPointer = LV->getType()->isPointerTy(), RIsPointer = RV->getType()->isPointerTy(); if (LIsPointer != RIsPointer) return (int)LIsPointer - (int)RIsPointer; // Compare getValueID values. unsigned LID = LV->getValueID(), RID = RV->getValueID(); if (LID != RID) return (int)LID - (int)RID; // Sort arguments by their position. if (const Argument *LA = dyn_cast(LV)) { const Argument *RA = cast(RV); unsigned LArgNo = LA->getArgNo(), RArgNo = RA->getArgNo(); return (int)LArgNo - (int)RArgNo; } // For instructions, compare their loop depth, and their operand // count. This is pretty loose. if (const Instruction *LInst = dyn_cast(LV)) { const Instruction *RInst = cast(RV); // Compare loop depths. const BasicBlock *LParent = LInst->getParent(), *RParent = RInst->getParent(); if (LParent != RParent) { unsigned LDepth = LI->getLoopDepth(LParent), RDepth = LI->getLoopDepth(RParent); if (LDepth != RDepth) return (int)LDepth - (int)RDepth; } // Compare the number of operands. unsigned LNumOps = LInst->getNumOperands(), RNumOps = RInst->getNumOperands(); return (int)LNumOps - (int)RNumOps; } return 0; } case scConstant: { const SCEVConstant *LC = cast(LHS); const SCEVConstant *RC = cast(RHS); // Compare constant values. const APInt &LA = LC->getAPInt(); const APInt &RA = RC->getAPInt(); unsigned LBitWidth = LA.getBitWidth(), RBitWidth = RA.getBitWidth(); if (LBitWidth != RBitWidth) return (int)LBitWidth - (int)RBitWidth; return LA.ult(RA) ? -1 : 1; } case scAddRecExpr: { const SCEVAddRecExpr *LA = cast(LHS); const SCEVAddRecExpr *RA = cast(RHS); // Compare addrec loop depths. const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop(); if (LLoop != RLoop) { unsigned LDepth = LLoop->getLoopDepth(), RDepth = RLoop->getLoopDepth(); if (LDepth != RDepth) return (int)LDepth - (int)RDepth; } // Addrec complexity grows with operand count. unsigned LNumOps = LA->getNumOperands(), RNumOps = RA->getNumOperands(); if (LNumOps != RNumOps) return (int)LNumOps - (int)RNumOps; // Lexicographically compare. for (unsigned i = 0; i != LNumOps; ++i) { long X = compare(LA->getOperand(i), RA->getOperand(i)); if (X != 0) return X; } return 0; } case scAddExpr: case scMulExpr: case scSMaxExpr: case scUMaxExpr: { const SCEVNAryExpr *LC = cast(LHS); const SCEVNAryExpr *RC = cast(RHS); // Lexicographically compare n-ary expressions. unsigned LNumOps = LC->getNumOperands(), RNumOps = RC->getNumOperands(); if (LNumOps != RNumOps) return (int)LNumOps - (int)RNumOps; for (unsigned i = 0; i != LNumOps; ++i) { if (i >= RNumOps) return 1; long X = compare(LC->getOperand(i), RC->getOperand(i)); if (X != 0) return X; } return (int)LNumOps - (int)RNumOps; } case scUDivExpr: { const SCEVUDivExpr *LC = cast(LHS); const SCEVUDivExpr *RC = cast(RHS); // Lexicographically compare udiv expressions. long X = compare(LC->getLHS(), RC->getLHS()); if (X != 0) return X; return compare(LC->getRHS(), RC->getRHS()); } case scTruncate: case scZeroExtend: case scSignExtend: { const SCEVCastExpr *LC = cast(LHS); const SCEVCastExpr *RC = cast(RHS); // Compare cast expressions by operand. return compare(LC->getOperand(), RC->getOperand()); } case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); } llvm_unreachable("Unknown SCEV kind!"); } }; } // end anonymous namespace /// Given a list of SCEV objects, order them by their complexity, and group /// objects of the same complexity together by value. When this routine is /// finished, we know that any duplicates in the vector are consecutive and that /// complexity is monotonically increasing. /// /// Note that we go take special precautions to ensure that we get deterministic /// results from this routine. In other words, we don't want the results of /// this to depend on where the addresses of various SCEV objects happened to /// land in memory. /// static void GroupByComplexity(SmallVectorImpl &Ops, LoopInfo *LI) { if (Ops.size() < 2) return; // Noop if (Ops.size() == 2) { // This is the common case, which also happens to be trivially simple. // Special case it. const SCEV *&LHS = Ops[0], *&RHS = Ops[1]; if (SCEVComplexityCompare(LI)(RHS, LHS)) std::swap(LHS, RHS); return; } // Do the rough sort by complexity. std::stable_sort(Ops.begin(), Ops.end(), SCEVComplexityCompare(LI)); // Now that we are sorted by complexity, group elements of the same // complexity. Note that this is, at worst, N^2, but the vector is likely to // be extremely short in practice. Note that we take this approach because we // do not want to depend on the addresses of the objects we are grouping. for (unsigned i = 0, e = Ops.size(); i != e-2; ++i) { const SCEV *S = Ops[i]; unsigned Complexity = S->getSCEVType(); // If there are any objects of the same complexity and same value as this // one, group them. for (unsigned j = i+1; j != e && Ops[j]->getSCEVType() == Complexity; ++j) { if (Ops[j] == S) { // Found a duplicate. // Move it to immediately after i'th element. std::swap(Ops[i+1], Ops[j]); ++i; // no need to rescan it. if (i == e-2) return; // Done! } } } } // Returns the size of the SCEV S. static inline int sizeOfSCEV(const SCEV *S) { struct FindSCEVSize { int Size; FindSCEVSize() : Size(0) {} bool follow(const SCEV *S) { ++Size; // Keep looking at all operands of S. return true; } bool isDone() const { return false; } }; FindSCEVSize F; SCEVTraversal ST(F); ST.visitAll(S); return F.Size; } namespace { struct SCEVDivision : public SCEVVisitor { public: // Computes the Quotient and Remainder of the division of Numerator by // Denominator. static void divide(ScalarEvolution &SE, const SCEV *Numerator, const SCEV *Denominator, const SCEV **Quotient, const SCEV **Remainder) { assert(Numerator && Denominator && "Uninitialized SCEV"); SCEVDivision D(SE, Numerator, Denominator); // Check for the trivial case here to avoid having to check for it in the // rest of the code. if (Numerator == Denominator) { *Quotient = D.One; *Remainder = D.Zero; return; } if (Numerator->isZero()) { *Quotient = D.Zero; *Remainder = D.Zero; return; } // A simple case when N/1. The quotient is N. if (Denominator->isOne()) { *Quotient = Numerator; *Remainder = D.Zero; return; } // Split the Denominator when it is a product. if (const SCEVMulExpr *T = dyn_cast(Denominator)) { const SCEV *Q, *R; *Quotient = Numerator; for (const SCEV *Op : T->operands()) { divide(SE, *Quotient, Op, &Q, &R); *Quotient = Q; // Bail out when the Numerator is not divisible by one of the terms of // the Denominator. if (!R->isZero()) { *Quotient = D.Zero; *Remainder = Numerator; return; } } *Remainder = D.Zero; return; } D.visit(Numerator); *Quotient = D.Quotient; *Remainder = D.Remainder; } // Except in the trivial case described above, we do not know how to divide // Expr by Denominator for the following functions with empty implementation. void visitTruncateExpr(const SCEVTruncateExpr *Numerator) {} void visitZeroExtendExpr(const SCEVZeroExtendExpr *Numerator) {} void visitSignExtendExpr(const SCEVSignExtendExpr *Numerator) {} void visitUDivExpr(const SCEVUDivExpr *Numerator) {} void visitSMaxExpr(const SCEVSMaxExpr *Numerator) {} void visitUMaxExpr(const SCEVUMaxExpr *Numerator) {} void visitUnknown(const SCEVUnknown *Numerator) {} void visitCouldNotCompute(const SCEVCouldNotCompute *Numerator) {} void visitConstant(const SCEVConstant *Numerator) { if (const SCEVConstant *D = dyn_cast(Denominator)) { APInt NumeratorVal = Numerator->getAPInt(); APInt DenominatorVal = D->getAPInt(); uint32_t NumeratorBW = NumeratorVal.getBitWidth(); uint32_t DenominatorBW = DenominatorVal.getBitWidth(); if (NumeratorBW > DenominatorBW) DenominatorVal = DenominatorVal.sext(NumeratorBW); else if (NumeratorBW < DenominatorBW) NumeratorVal = NumeratorVal.sext(DenominatorBW); APInt QuotientVal(NumeratorVal.getBitWidth(), 0); APInt RemainderVal(NumeratorVal.getBitWidth(), 0); APInt::sdivrem(NumeratorVal, DenominatorVal, QuotientVal, RemainderVal); Quotient = SE.getConstant(QuotientVal); Remainder = SE.getConstant(RemainderVal); return; } } void visitAddRecExpr(const SCEVAddRecExpr *Numerator) { const SCEV *StartQ, *StartR, *StepQ, *StepR; if (!Numerator->isAffine()) return cannotDivide(Numerator); divide(SE, Numerator->getStart(), Denominator, &StartQ, &StartR); divide(SE, Numerator->getStepRecurrence(SE), Denominator, &StepQ, &StepR); // Bail out if the types do not match. Type *Ty = Denominator->getType(); if (Ty != StartQ->getType() || Ty != StartR->getType() || Ty != StepQ->getType() || Ty != StepR->getType()) return cannotDivide(Numerator); Quotient = SE.getAddRecExpr(StartQ, StepQ, Numerator->getLoop(), Numerator->getNoWrapFlags()); Remainder = SE.getAddRecExpr(StartR, StepR, Numerator->getLoop(), Numerator->getNoWrapFlags()); } void visitAddExpr(const SCEVAddExpr *Numerator) { SmallVector Qs, Rs; Type *Ty = Denominator->getType(); for (const SCEV *Op : Numerator->operands()) { const SCEV *Q, *R; divide(SE, Op, Denominator, &Q, &R); // Bail out if types do not match. if (Ty != Q->getType() || Ty != R->getType()) return cannotDivide(Numerator); Qs.push_back(Q); Rs.push_back(R); } if (Qs.size() == 1) { Quotient = Qs[0]; Remainder = Rs[0]; return; } Quotient = SE.getAddExpr(Qs); Remainder = SE.getAddExpr(Rs); } void visitMulExpr(const SCEVMulExpr *Numerator) { SmallVector Qs; Type *Ty = Denominator->getType(); bool FoundDenominatorTerm = false; for (const SCEV *Op : Numerator->operands()) { // Bail out if types do not match. if (Ty != Op->getType()) return cannotDivide(Numerator); if (FoundDenominatorTerm) { Qs.push_back(Op); continue; } // Check whether Denominator divides one of the product operands. const SCEV *Q, *R; divide(SE, Op, Denominator, &Q, &R); if (!R->isZero()) { Qs.push_back(Op); continue; } // Bail out if types do not match. if (Ty != Q->getType()) return cannotDivide(Numerator); FoundDenominatorTerm = true; Qs.push_back(Q); } if (FoundDenominatorTerm) { Remainder = Zero; if (Qs.size() == 1) Quotient = Qs[0]; else Quotient = SE.getMulExpr(Qs); return; } if (!isa(Denominator)) return cannotDivide(Numerator); // The Remainder is obtained by replacing Denominator by 0 in Numerator. ValueToValueMap RewriteMap; RewriteMap[cast(Denominator)->getValue()] = cast(Zero)->getValue(); Remainder = SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true); if (Remainder->isZero()) { // The Quotient is obtained by replacing Denominator by 1 in Numerator. RewriteMap[cast(Denominator)->getValue()] = cast(One)->getValue(); Quotient = SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true); return; } // Quotient is (Numerator - Remainder) divided by Denominator. const SCEV *Q, *R; const SCEV *Diff = SE.getMinusSCEV(Numerator, Remainder); // This SCEV does not seem to simplify: fail the division here. if (sizeOfSCEV(Diff) > sizeOfSCEV(Numerator)) return cannotDivide(Numerator); divide(SE, Diff, Denominator, &Q, &R); if (R != Zero) return cannotDivide(Numerator); Quotient = Q; } private: SCEVDivision(ScalarEvolution &S, const SCEV *Numerator, const SCEV *Denominator) : SE(S), Denominator(Denominator) { Zero = SE.getZero(Denominator->getType()); One = SE.getOne(Denominator->getType()); // We generally do not know how to divide Expr by Denominator. We // initialize the division to a "cannot divide" state to simplify the rest // of the code. cannotDivide(Numerator); } // Convenience function for giving up on the division. We set the quotient to // be equal to zero and the remainder to be equal to the numerator. void cannotDivide(const SCEV *Numerator) { Quotient = Zero; Remainder = Numerator; } ScalarEvolution &SE; const SCEV *Denominator, *Quotient, *Remainder, *Zero, *One; }; } //===----------------------------------------------------------------------===// // Simple SCEV method implementations //===----------------------------------------------------------------------===// /// Compute BC(It, K). The result has width W. Assume, K > 0. static const SCEV *BinomialCoefficient(const SCEV *It, unsigned K, ScalarEvolution &SE, Type *ResultTy) { // Handle the simplest case efficiently. if (K == 1) return SE.getTruncateOrZeroExtend(It, ResultTy); // We are using the following formula for BC(It, K): // // BC(It, K) = (It * (It - 1) * ... * (It - K + 1)) / K! // // Suppose, W is the bitwidth of the return value. We must be prepared for // overflow. Hence, we must assure that the result of our computation is // equal to the accurate one modulo 2^W. Unfortunately, division isn't // safe in modular arithmetic. // // However, this code doesn't use exactly that formula; the formula it uses // is something like the following, where T is the number of factors of 2 in // K! (i.e. trailing zeros in the binary representation of K!), and ^ is // exponentiation: // // BC(It, K) = (It * (It - 1) * ... * (It - K + 1)) / 2^T / (K! / 2^T) // // This formula is trivially equivalent to the previous formula. However, // this formula can be implemented much more efficiently. The trick is that // K! / 2^T is odd, and exact division by an odd number *is* safe in modular // arithmetic. To do exact division in modular arithmetic, all we have // to do is multiply by the inverse. Therefore, this step can be done at // width W. // // The next issue is how to safely do the division by 2^T. The way this // is done is by doing the multiplication step at a width of at least W + T // bits. This way, the bottom W+T bits of the product are accurate. Then, // when we perform the division by 2^T (which is equivalent to a right shift // by T), the bottom W bits are accurate. Extra bits are okay; they'll get // truncated out after the division by 2^T. // // In comparison to just directly using the first formula, this technique // is much more efficient; using the first formula requires W * K bits, // but this formula less than W + K bits. Also, the first formula requires // a division step, whereas this formula only requires multiplies and shifts. // // It doesn't matter whether the subtraction step is done in the calculation // width or the input iteration count's width; if the subtraction overflows, // the result must be zero anyway. We prefer here to do it in the width of // the induction variable because it helps a lot for certain cases; CodeGen // isn't smart enough to ignore the overflow, which leads to much less // efficient code if the width of the subtraction is wider than the native // register width. // // (It's possible to not widen at all by pulling out factors of 2 before // the multiplication; for example, K=2 can be calculated as // It/2*(It+(It*INT_MIN/INT_MIN)+-1). However, it requires // extra arithmetic, so it's not an obvious win, and it gets // much more complicated for K > 3.) // Protection from insane SCEVs; this bound is conservative, // but it probably doesn't matter. if (K > 1000) return SE.getCouldNotCompute(); unsigned W = SE.getTypeSizeInBits(ResultTy); // Calculate K! / 2^T and T; we divide out the factors of two before // multiplying for calculating K! / 2^T to avoid overflow. // Other overflow doesn't matter because we only care about the bottom // W bits of the result. APInt OddFactorial(W, 1); unsigned T = 1; for (unsigned i = 3; i <= K; ++i) { APInt Mult(W, i); unsigned TwoFactors = Mult.countTrailingZeros(); T += TwoFactors; Mult = Mult.lshr(TwoFactors); OddFactorial *= Mult; } // We need at least W + T bits for the multiplication step unsigned CalculationBits = W + T; // Calculate 2^T, at width T+W. APInt DivFactor = APInt::getOneBitSet(CalculationBits, T); // Calculate the multiplicative inverse of K! / 2^T; // this multiplication factor will perform the exact division by // K! / 2^T. APInt Mod = APInt::getSignedMinValue(W+1); APInt MultiplyFactor = OddFactorial.zext(W+1); MultiplyFactor = MultiplyFactor.multiplicativeInverse(Mod); MultiplyFactor = MultiplyFactor.trunc(W); // Calculate the product, at width T+W IntegerType *CalculationTy = IntegerType::get(SE.getContext(), CalculationBits); const SCEV *Dividend = SE.getTruncateOrZeroExtend(It, CalculationTy); for (unsigned i = 1; i != K; ++i) { const SCEV *S = SE.getMinusSCEV(It, SE.getConstant(It->getType(), i)); Dividend = SE.getMulExpr(Dividend, SE.getTruncateOrZeroExtend(S, CalculationTy)); } // Divide by 2^T const SCEV *DivResult = SE.getUDivExpr(Dividend, SE.getConstant(DivFactor)); // Truncate the result, and divide by K! / 2^T. return SE.getMulExpr(SE.getConstant(MultiplyFactor), SE.getTruncateOrZeroExtend(DivResult, ResultTy)); } /// Return the value of this chain of recurrences at the specified iteration /// number. We can evaluate this recurrence by multiplying each element in the /// chain by the binomial coefficient corresponding to it. In other words, we /// can evaluate {A,+,B,+,C,+,D} as: /// /// A*BC(It, 0) + B*BC(It, 1) + C*BC(It, 2) + D*BC(It, 3) /// /// where BC(It, k) stands for binomial coefficient. /// const SCEV *SCEVAddRecExpr::evaluateAtIteration(const SCEV *It, ScalarEvolution &SE) const { const SCEV *Result = getStart(); for (unsigned i = 1, e = getNumOperands(); i != e; ++i) { // The computation is correct in the face of overflow provided that the // multiplication is performed _after_ the evaluation of the binomial // coefficient. const SCEV *Coeff = BinomialCoefficient(It, i, SE, getType()); if (isa(Coeff)) return Coeff; Result = SE.getAddExpr(Result, SE.getMulExpr(getOperand(i), Coeff)); } return Result; } //===----------------------------------------------------------------------===// // SCEV Expression folder implementations //===----------------------------------------------------------------------===// const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op, Type *Ty) { assert(getTypeSizeInBits(Op->getType()) > getTypeSizeInBits(Ty) && "This is not a truncating conversion!"); assert(isSCEVable(Ty) && "This is not a conversion to a SCEVable type!"); Ty = getEffectiveSCEVType(Ty); FoldingSetNodeID ID; ID.AddInteger(scTruncate); ID.AddPointer(Op); ID.AddPointer(Ty); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast(Op)) return getConstant( cast(ConstantExpr::getTrunc(SC->getValue(), Ty))); // trunc(trunc(x)) --> trunc(x) if (const SCEVTruncateExpr *ST = dyn_cast(Op)) return getTruncateExpr(ST->getOperand(), Ty); // trunc(sext(x)) --> sext(x) if widening or trunc(x) if narrowing if (const SCEVSignExtendExpr *SS = dyn_cast(Op)) return getTruncateOrSignExtend(SS->getOperand(), Ty); // trunc(zext(x)) --> zext(x) if widening or trunc(x) if narrowing if (const SCEVZeroExtendExpr *SZ = dyn_cast(Op)) return getTruncateOrZeroExtend(SZ->getOperand(), Ty); // trunc(x1+x2+...+xN) --> trunc(x1)+trunc(x2)+...+trunc(xN) if we can // eliminate all the truncates, or we replace other casts with truncates. if (const SCEVAddExpr *SA = dyn_cast(Op)) { SmallVector Operands; bool hasTrunc = false; for (unsigned i = 0, e = SA->getNumOperands(); i != e && !hasTrunc; ++i) { const SCEV *S = getTruncateExpr(SA->getOperand(i), Ty); if (!isa(SA->getOperand(i))) hasTrunc = isa(S); Operands.push_back(S); } if (!hasTrunc) return getAddExpr(Operands); UniqueSCEVs.FindNodeOrInsertPos(ID, IP); // Mutates IP, returns NULL. } // trunc(x1*x2*...*xN) --> trunc(x1)*trunc(x2)*...*trunc(xN) if we can // eliminate all the truncates, or we replace other casts with truncates. if (const SCEVMulExpr *SM = dyn_cast(Op)) { SmallVector Operands; bool hasTrunc = false; for (unsigned i = 0, e = SM->getNumOperands(); i != e && !hasTrunc; ++i) { const SCEV *S = getTruncateExpr(SM->getOperand(i), Ty); if (!isa(SM->getOperand(i))) hasTrunc = isa(S); Operands.push_back(S); } if (!hasTrunc) return getMulExpr(Operands); UniqueSCEVs.FindNodeOrInsertPos(ID, IP); // Mutates IP, returns NULL. } // If the input value is a chrec scev, truncate the chrec's operands. if (const SCEVAddRecExpr *AddRec = dyn_cast(Op)) { SmallVector Operands; for (const SCEV *Op : AddRec->operands()) Operands.push_back(getTruncateExpr(Op, Ty)); return getAddRecExpr(Operands, AddRec->getLoop(), SCEV::FlagAnyWrap); } // The cast wasn't folded; create an explicit cast node. We can reuse // the existing insert position since if we get here, we won't have // made any changes which would invalidate it. SCEV *S = new (SCEVAllocator) SCEVTruncateExpr(ID.Intern(SCEVAllocator), Op, Ty); UniqueSCEVs.InsertNode(S, IP); return S; } // Get the limit of a recurrence such that incrementing by Step cannot cause // signed overflow as long as the value of the recurrence within the // loop does not exceed this limit before incrementing. static const SCEV *getSignedOverflowLimitForStep(const SCEV *Step, ICmpInst::Predicate *Pred, ScalarEvolution *SE) { unsigned BitWidth = SE->getTypeSizeInBits(Step->getType()); if (SE->isKnownPositive(Step)) { *Pred = ICmpInst::ICMP_SLT; return SE->getConstant(APInt::getSignedMinValue(BitWidth) - SE->getSignedRange(Step).getSignedMax()); } if (SE->isKnownNegative(Step)) { *Pred = ICmpInst::ICMP_SGT; return SE->getConstant(APInt::getSignedMaxValue(BitWidth) - SE->getSignedRange(Step).getSignedMin()); } return nullptr; } // Get the limit of a recurrence such that incrementing by Step cannot cause // unsigned overflow as long as the value of the recurrence within the loop does // not exceed this limit before incrementing. static const SCEV *getUnsignedOverflowLimitForStep(const SCEV *Step, ICmpInst::Predicate *Pred, ScalarEvolution *SE) { unsigned BitWidth = SE->getTypeSizeInBits(Step->getType()); *Pred = ICmpInst::ICMP_ULT; return SE->getConstant(APInt::getMinValue(BitWidth) - SE->getUnsignedRange(Step).getUnsignedMax()); } namespace { struct ExtendOpTraitsBase { typedef const SCEV *(ScalarEvolution::*GetExtendExprTy)(const SCEV *, Type *); }; // Used to make code generic over signed and unsigned overflow. template struct ExtendOpTraits { // Members present: // // static const SCEV::NoWrapFlags WrapType; // // static const ExtendOpTraitsBase::GetExtendExprTy GetExtendExpr; // // static const SCEV *getOverflowLimitForStep(const SCEV *Step, // ICmpInst::Predicate *Pred, // ScalarEvolution *SE); }; template <> struct ExtendOpTraits : public ExtendOpTraitsBase { static const SCEV::NoWrapFlags WrapType = SCEV::FlagNSW; static const GetExtendExprTy GetExtendExpr; static const SCEV *getOverflowLimitForStep(const SCEV *Step, ICmpInst::Predicate *Pred, ScalarEvolution *SE) { return getSignedOverflowLimitForStep(Step, Pred, SE); } }; const ExtendOpTraitsBase::GetExtendExprTy ExtendOpTraits< SCEVSignExtendExpr>::GetExtendExpr = &ScalarEvolution::getSignExtendExpr; template <> struct ExtendOpTraits : public ExtendOpTraitsBase { static const SCEV::NoWrapFlags WrapType = SCEV::FlagNUW; static const GetExtendExprTy GetExtendExpr; static const SCEV *getOverflowLimitForStep(const SCEV *Step, ICmpInst::Predicate *Pred, ScalarEvolution *SE) { return getUnsignedOverflowLimitForStep(Step, Pred, SE); } }; const ExtendOpTraitsBase::GetExtendExprTy ExtendOpTraits< SCEVZeroExtendExpr>::GetExtendExpr = &ScalarEvolution::getZeroExtendExpr; } // The recurrence AR has been shown to have no signed/unsigned wrap or something // close to it. Typically, if we can prove NSW/NUW for AR, then we can just as // easily prove NSW/NUW for its preincrement or postincrement sibling. This // allows normalizing a sign/zero extended AddRec as such: {sext/zext(Step + // Start),+,Step} => {(Step + sext/zext(Start),+,Step} As a result, the // expression "Step + sext/zext(PreIncAR)" is congruent with // "sext/zext(PostIncAR)" template static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty, ScalarEvolution *SE) { auto WrapType = ExtendOpTraits::WrapType; auto GetExtendExpr = ExtendOpTraits::GetExtendExpr; const Loop *L = AR->getLoop(); const SCEV *Start = AR->getStart(); const SCEV *Step = AR->getStepRecurrence(*SE); // Check for a simple looking step prior to loop entry. const SCEVAddExpr *SA = dyn_cast(Start); if (!SA) return nullptr; // Create an AddExpr for "PreStart" after subtracting Step. Full SCEV // subtraction is expensive. For this purpose, perform a quick and dirty // difference, by checking for Step in the operand list. SmallVector DiffOps; for (const SCEV *Op : SA->operands()) if (Op != Step) DiffOps.push_back(Op); if (DiffOps.size() == SA->getNumOperands()) return nullptr; // Try to prove `WrapType` (SCEV::FlagNSW or SCEV::FlagNUW) on `PreStart` + // `Step`: // 1. NSW/NUW flags on the step increment. auto PreStartFlags = ScalarEvolution::maskFlags(SA->getNoWrapFlags(), SCEV::FlagNUW); const SCEV *PreStart = SE->getAddExpr(DiffOps, PreStartFlags); const SCEVAddRecExpr *PreAR = dyn_cast( SE->getAddRecExpr(PreStart, Step, L, SCEV::FlagAnyWrap)); // "{S,+,X} is /" and "the backedge is taken at least once" implies // "S+X does not sign/unsign-overflow". // const SCEV *BECount = SE->getBackedgeTakenCount(L); if (PreAR && PreAR->getNoWrapFlags(WrapType) && !isa(BECount) && SE->isKnownPositive(BECount)) return PreStart; // 2. Direct overflow check on the step operation's expression. unsigned BitWidth = SE->getTypeSizeInBits(AR->getType()); Type *WideTy = IntegerType::get(SE->getContext(), BitWidth * 2); const SCEV *OperandExtendedStart = SE->getAddExpr((SE->*GetExtendExpr)(PreStart, WideTy), (SE->*GetExtendExpr)(Step, WideTy)); if ((SE->*GetExtendExpr)(Start, WideTy) == OperandExtendedStart) { if (PreAR && AR->getNoWrapFlags(WrapType)) { // If we know `AR` == {`PreStart`+`Step`,+,`Step`} is `WrapType` (FlagNSW // or FlagNUW) and that `PreStart` + `Step` is `WrapType` too, then // `PreAR` == {`PreStart`,+,`Step`} is also `WrapType`. Cache this fact. const_cast(PreAR)->setNoWrapFlags(WrapType); } return PreStart; } // 3. Loop precondition. ICmpInst::Predicate Pred; const SCEV *OverflowLimit = ExtendOpTraits::getOverflowLimitForStep(Step, &Pred, SE); if (OverflowLimit && SE->isLoopEntryGuardedByCond(L, Pred, PreStart, OverflowLimit)) return PreStart; return nullptr; } // Get the normalized zero or sign extended expression for this AddRec's Start. template static const SCEV *getExtendAddRecStart(const SCEVAddRecExpr *AR, Type *Ty, ScalarEvolution *SE) { auto GetExtendExpr = ExtendOpTraits::GetExtendExpr; const SCEV *PreStart = getPreStartForExtend(AR, Ty, SE); if (!PreStart) return (SE->*GetExtendExpr)(AR->getStart(), Ty); return SE->getAddExpr((SE->*GetExtendExpr)(AR->getStepRecurrence(*SE), Ty), (SE->*GetExtendExpr)(PreStart, Ty)); } // Try to prove away overflow by looking at "nearby" add recurrences. A // motivating example for this rule: if we know `{0,+,4}` is `ult` `-1` and it // does not itself wrap then we can conclude that `{1,+,4}` is `nuw`. // // Formally: // // {S,+,X} == {S-T,+,X} + T // => Ext({S,+,X}) == Ext({S-T,+,X} + T) // // If ({S-T,+,X} + T) does not overflow ... (1) // // RHS == Ext({S-T,+,X} + T) == Ext({S-T,+,X}) + Ext(T) // // If {S-T,+,X} does not overflow ... (2) // // RHS == Ext({S-T,+,X}) + Ext(T) == {Ext(S-T),+,Ext(X)} + Ext(T) // == {Ext(S-T)+Ext(T),+,Ext(X)} // // If (S-T)+T does not overflow ... (3) // // RHS == {Ext(S-T)+Ext(T),+,Ext(X)} == {Ext(S-T+T),+,Ext(X)} // == {Ext(S),+,Ext(X)} == LHS // // Thus, if (1), (2) and (3) are true for some T, then // Ext({S,+,X}) == {Ext(S),+,Ext(X)} // // (3) is implied by (1) -- "(S-T)+T does not overflow" is simply "({S-T,+,X}+T) // does not overflow" restricted to the 0th iteration. Therefore we only need // to check for (1) and (2). // // In the current context, S is `Start`, X is `Step`, Ext is `ExtendOpTy` and T // is `Delta` (defined below). // template bool ScalarEvolution::proveNoWrapByVaryingStart(const SCEV *Start, const SCEV *Step, const Loop *L) { auto WrapType = ExtendOpTraits::WrapType; // We restrict `Start` to a constant to prevent SCEV from spending too much // time here. It is correct (but more expensive) to continue with a // non-constant `Start` and do a general SCEV subtraction to compute // `PreStart` below. // const SCEVConstant *StartC = dyn_cast(Start); if (!StartC) return false; APInt StartAI = StartC->getAPInt(); for (unsigned Delta : {-2, -1, 1, 2}) { const SCEV *PreStart = getConstant(StartAI - Delta); FoldingSetNodeID ID; ID.AddInteger(scAddRecExpr); ID.AddPointer(PreStart); ID.AddPointer(Step); ID.AddPointer(L); void *IP = nullptr; const auto *PreAR = static_cast(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); // Give up if we don't already have the add recurrence we need because // actually constructing an add recurrence is relatively expensive. if (PreAR && PreAR->getNoWrapFlags(WrapType)) { // proves (2) const SCEV *DeltaS = getConstant(StartC->getType(), Delta); ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; const SCEV *Limit = ExtendOpTraits::getOverflowLimitForStep( DeltaS, &Pred, this); if (Limit && isKnownPredicate(Pred, PreAR, Limit)) // proves (1) return true; } } return false; } const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty) { assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) && "This is not an extending conversion!"); assert(isSCEVable(Ty) && "This is not a conversion to a SCEVable type!"); Ty = getEffectiveSCEVType(Ty); // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast(Op)) return getConstant( cast(ConstantExpr::getZExt(SC->getValue(), Ty))); // zext(zext(x)) --> zext(x) if (const SCEVZeroExtendExpr *SZ = dyn_cast(Op)) return getZeroExtendExpr(SZ->getOperand(), Ty); // Before doing any expensive analysis, check to see if we've already // computed a SCEV for this Op and Ty. FoldingSetNodeID ID; ID.AddInteger(scZeroExtend); ID.AddPointer(Op); ID.AddPointer(Ty); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; // zext(trunc(x)) --> zext(x) or x or trunc(x) if (const SCEVTruncateExpr *ST = dyn_cast(Op)) { // It's possible the bits taken off by the truncate were all zero bits. If // so, we should be able to simplify this further. const SCEV *X = ST->getOperand(); ConstantRange CR = getUnsignedRange(X); unsigned TruncBits = getTypeSizeInBits(ST->getType()); unsigned NewBits = getTypeSizeInBits(Ty); if (CR.truncate(TruncBits).zeroExtend(NewBits).contains( CR.zextOrTrunc(NewBits))) return getTruncateOrZeroExtend(X, Ty); } // If the input value is a chrec scev, and we can prove that the value // did not overflow the old, smaller, value, we can zero extend all of the // operands (often constants). This allows analysis of something like // this: for (unsigned char X = 0; X < 100; ++X) { int Y = X; } if (const SCEVAddRecExpr *AR = dyn_cast(Op)) if (AR->isAffine()) { const SCEV *Start = AR->getStart(); const SCEV *Step = AR->getStepRecurrence(*this); unsigned BitWidth = getTypeSizeInBits(AR->getType()); const Loop *L = AR->getLoop(); if (!AR->hasNoUnsignedWrap()) { auto NewFlags = proveNoWrapViaConstantRanges(AR); const_cast(AR)->setNoWrapFlags(NewFlags); } // If we have special knowledge that this addrec won't overflow, // we don't need to do any further analysis. if (AR->hasNoUnsignedWrap()) return getAddRecExpr( getExtendAddRecStart(AR, Ty, this), getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); // Check whether the backedge-taken count is SCEVCouldNotCompute. // Note that this serves two purposes: It filters out loops that are // simply not analyzable, and it covers the case where this code is // being called from within backedge-taken count analysis, such that // attempting to ask for the backedge-taken count would likely result // in infinite recursion. In the later case, the analysis code will // cope with a conservative value, and it will take care to purge // that value once it has finished. const SCEV *MaxBECount = getMaxBackedgeTakenCount(L); if (!isa(MaxBECount)) { // Manually compute the final value for AR, checking for // overflow. // Check whether the backedge-taken count can be losslessly casted to // the addrec's type. The count is always unsigned. const SCEV *CastedMaxBECount = getTruncateOrZeroExtend(MaxBECount, Start->getType()); const SCEV *RecastedMaxBECount = getTruncateOrZeroExtend(CastedMaxBECount, MaxBECount->getType()); if (MaxBECount == RecastedMaxBECount) { Type *WideTy = IntegerType::get(getContext(), BitWidth * 2); // Check whether Start+Step*MaxBECount has no unsigned overflow. const SCEV *ZMul = getMulExpr(CastedMaxBECount, Step); const SCEV *ZAdd = getZeroExtendExpr(getAddExpr(Start, ZMul), WideTy); const SCEV *WideStart = getZeroExtendExpr(Start, WideTy); const SCEV *WideMaxBECount = getZeroExtendExpr(CastedMaxBECount, WideTy); const SCEV *OperandExtendedAdd = getAddExpr(WideStart, getMulExpr(WideMaxBECount, getZeroExtendExpr(Step, WideTy))); if (ZAdd == OperandExtendedAdd) { // Cache knowledge of AR NUW, which is propagated to this AddRec. const_cast(AR)->setNoWrapFlags(SCEV::FlagNUW); // Return the expression with the addrec on the outside. return getAddRecExpr( getExtendAddRecStart(AR, Ty, this), getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); } // Similar to above, only this time treat the step value as signed. // This covers loops that count down. OperandExtendedAdd = getAddExpr(WideStart, getMulExpr(WideMaxBECount, getSignExtendExpr(Step, WideTy))); if (ZAdd == OperandExtendedAdd) { // Cache knowledge of AR NW, which is propagated to this AddRec. // Negative step causes unsigned wrap, but it still can't self-wrap. const_cast(AR)->setNoWrapFlags(SCEV::FlagNW); // Return the expression with the addrec on the outside. return getAddRecExpr( getExtendAddRecStart(AR, Ty, this), getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); } } } // Normally, in the cases we can prove no-overflow via a // backedge guarding condition, we can also compute a backedge // taken count for the loop. The exceptions are assumptions and // guards present in the loop -- SCEV is not great at exploiting // these to compute max backedge taken counts, but can still use // these to prove lack of overflow. Use this fact to avoid // doing extra work that may not pay off. if (!isa(MaxBECount) || HasGuards || !AC.assumptions().empty()) { // If the backedge is guarded by a comparison with the pre-inc // value the addrec is safe. Also, if the entry is guarded by // a comparison with the start value and the backedge is // guarded by a comparison with the post-inc value, the addrec // is safe. if (isKnownPositive(Step)) { const SCEV *N = getConstant(APInt::getMinValue(BitWidth) - getUnsignedRange(Step).getUnsignedMax()); if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT, AR, N) || (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_ULT, Start, N) && isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT, AR->getPostIncExpr(*this), N))) { // Cache knowledge of AR NUW, which is propagated to this // AddRec. const_cast(AR)->setNoWrapFlags(SCEV::FlagNUW); // Return the expression with the addrec on the outside. return getAddRecExpr( getExtendAddRecStart(AR, Ty, this), getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); } } else if (isKnownNegative(Step)) { const SCEV *N = getConstant(APInt::getMaxValue(BitWidth) - getSignedRange(Step).getSignedMin()); if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_UGT, AR, N) || (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_UGT, Start, N) && isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_UGT, AR->getPostIncExpr(*this), N))) { // Cache knowledge of AR NW, which is propagated to this // AddRec. Negative step causes unsigned wrap, but it // still can't self-wrap. const_cast(AR)->setNoWrapFlags(SCEV::FlagNW); // Return the expression with the addrec on the outside. return getAddRecExpr( getExtendAddRecStart(AR, Ty, this), getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); } } } if (proveNoWrapByVaryingStart(Start, Step, L)) { const_cast(AR)->setNoWrapFlags(SCEV::FlagNUW); return getAddRecExpr( getExtendAddRecStart(AR, Ty, this), getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); } } if (auto *SA = dyn_cast(Op)) { // zext((A + B + ...)) --> (zext(A) + zext(B) + ...) if (SA->hasNoUnsignedWrap()) { // If the addition does not unsign overflow then we can, by definition, // commute the zero extension with the addition operation. SmallVector Ops; for (const auto *Op : SA->operands()) Ops.push_back(getZeroExtendExpr(Op, Ty)); return getAddExpr(Ops, SCEV::FlagNUW); } } // The cast wasn't folded; create an explicit cast node. // Recompute the insert position, as it may have been invalidated. if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; SCEV *S = new (SCEVAllocator) SCEVZeroExtendExpr(ID.Intern(SCEVAllocator), Op, Ty); UniqueSCEVs.InsertNode(S, IP); return S; } const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty) { assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) && "This is not an extending conversion!"); assert(isSCEVable(Ty) && "This is not a conversion to a SCEVable type!"); Ty = getEffectiveSCEVType(Ty); // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast(Op)) return getConstant( cast(ConstantExpr::getSExt(SC->getValue(), Ty))); // sext(sext(x)) --> sext(x) if (const SCEVSignExtendExpr *SS = dyn_cast(Op)) return getSignExtendExpr(SS->getOperand(), Ty); // sext(zext(x)) --> zext(x) if (const SCEVZeroExtendExpr *SZ = dyn_cast(Op)) return getZeroExtendExpr(SZ->getOperand(), Ty); // Before doing any expensive analysis, check to see if we've already // computed a SCEV for this Op and Ty. FoldingSetNodeID ID; ID.AddInteger(scSignExtend); ID.AddPointer(Op); ID.AddPointer(Ty); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; // sext(trunc(x)) --> sext(x) or x or trunc(x) if (const SCEVTruncateExpr *ST = dyn_cast(Op)) { // It's possible the bits taken off by the truncate were all sign bits. If // so, we should be able to simplify this further. const SCEV *X = ST->getOperand(); ConstantRange CR = getSignedRange(X); unsigned TruncBits = getTypeSizeInBits(ST->getType()); unsigned NewBits = getTypeSizeInBits(Ty); if (CR.truncate(TruncBits).signExtend(NewBits).contains( CR.sextOrTrunc(NewBits))) return getTruncateOrSignExtend(X, Ty); } // sext(C1 + (C2 * x)) --> C1 + sext(C2 * x) if C1 < C2 if (auto *SA = dyn_cast(Op)) { if (SA->getNumOperands() == 2) { auto *SC1 = dyn_cast(SA->getOperand(0)); auto *SMul = dyn_cast(SA->getOperand(1)); if (SMul && SC1) { if (auto *SC2 = dyn_cast(SMul->getOperand(0))) { const APInt &C1 = SC1->getAPInt(); const APInt &C2 = SC2->getAPInt(); if (C1.isStrictlyPositive() && C2.isStrictlyPositive() && C2.ugt(C1) && C2.isPowerOf2()) return getAddExpr(getSignExtendExpr(SC1, Ty), getSignExtendExpr(SMul, Ty)); } } } // sext((A + B + ...)) --> (sext(A) + sext(B) + ...) if (SA->hasNoSignedWrap()) { // If the addition does not sign overflow then we can, by definition, // commute the sign extension with the addition operation. SmallVector Ops; for (const auto *Op : SA->operands()) Ops.push_back(getSignExtendExpr(Op, Ty)); return getAddExpr(Ops, SCEV::FlagNSW); } } // If the input value is a chrec scev, and we can prove that the value // did not overflow the old, smaller, value, we can sign extend all of the // operands (often constants). This allows analysis of something like // this: for (signed char X = 0; X < 100; ++X) { int Y = X; } if (const SCEVAddRecExpr *AR = dyn_cast(Op)) if (AR->isAffine()) { const SCEV *Start = AR->getStart(); const SCEV *Step = AR->getStepRecurrence(*this); unsigned BitWidth = getTypeSizeInBits(AR->getType()); const Loop *L = AR->getLoop(); if (!AR->hasNoSignedWrap()) { auto NewFlags = proveNoWrapViaConstantRanges(AR); const_cast(AR)->setNoWrapFlags(NewFlags); } // If we have special knowledge that this addrec won't overflow, // we don't need to do any further analysis. if (AR->hasNoSignedWrap()) return getAddRecExpr( getExtendAddRecStart(AR, Ty, this), getSignExtendExpr(Step, Ty), L, SCEV::FlagNSW); // Check whether the backedge-taken count is SCEVCouldNotCompute. // Note that this serves two purposes: It filters out loops that are // simply not analyzable, and it covers the case where this code is // being called from within backedge-taken count analysis, such that // attempting to ask for the backedge-taken count would likely result // in infinite recursion. In the later case, the analysis code will // cope with a conservative value, and it will take care to purge // that value once it has finished. const SCEV *MaxBECount = getMaxBackedgeTakenCount(L); if (!isa(MaxBECount)) { // Manually compute the final value for AR, checking for // overflow. // Check whether the backedge-taken count can be losslessly casted to // the addrec's type. The count is always unsigned. const SCEV *CastedMaxBECount = getTruncateOrZeroExtend(MaxBECount, Start->getType()); const SCEV *RecastedMaxBECount = getTruncateOrZeroExtend(CastedMaxBECount, MaxBECount->getType()); if (MaxBECount == RecastedMaxBECount) { Type *WideTy = IntegerType::get(getContext(), BitWidth * 2); // Check whether Start+Step*MaxBECount has no signed overflow. const SCEV *SMul = getMulExpr(CastedMaxBECount, Step); const SCEV *SAdd = getSignExtendExpr(getAddExpr(Start, SMul), WideTy); const SCEV *WideStart = getSignExtendExpr(Start, WideTy); const SCEV *WideMaxBECount = getZeroExtendExpr(CastedMaxBECount, WideTy); const SCEV *OperandExtendedAdd = getAddExpr(WideStart, getMulExpr(WideMaxBECount, getSignExtendExpr(Step, WideTy))); if (SAdd == OperandExtendedAdd) { // Cache knowledge of AR NSW, which is propagated to this AddRec. const_cast(AR)->setNoWrapFlags(SCEV::FlagNSW); // Return the expression with the addrec on the outside. return getAddRecExpr( getExtendAddRecStart(AR, Ty, this), getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); } // Similar to above, only this time treat the step value as unsigned. // This covers loops that count up with an unsigned step. OperandExtendedAdd = getAddExpr(WideStart, getMulExpr(WideMaxBECount, getZeroExtendExpr(Step, WideTy))); if (SAdd == OperandExtendedAdd) { // If AR wraps around then // // abs(Step) * MaxBECount > unsigned-max(AR->getType()) // => SAdd != OperandExtendedAdd // // Thus (AR is not NW => SAdd != OperandExtendedAdd) <=> // (SAdd == OperandExtendedAdd => AR is NW) const_cast(AR)->setNoWrapFlags(SCEV::FlagNW); // Return the expression with the addrec on the outside. return getAddRecExpr( getExtendAddRecStart(AR, Ty, this), getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); } } } // Normally, in the cases we can prove no-overflow via a // backedge guarding condition, we can also compute a backedge // taken count for the loop. The exceptions are assumptions and // guards present in the loop -- SCEV is not great at exploiting // these to compute max backedge taken counts, but can still use // these to prove lack of overflow. Use this fact to avoid // doing extra work that may not pay off. if (!isa(MaxBECount) || HasGuards || !AC.assumptions().empty()) { // If the backedge is guarded by a comparison with the pre-inc // value the addrec is safe. Also, if the entry is guarded by // a comparison with the start value and the backedge is // guarded by a comparison with the post-inc value, the addrec // is safe. ICmpInst::Predicate Pred; const SCEV *OverflowLimit = getSignedOverflowLimitForStep(Step, &Pred, this); if (OverflowLimit && (isLoopBackedgeGuardedByCond(L, Pred, AR, OverflowLimit) || (isLoopEntryGuardedByCond(L, Pred, Start, OverflowLimit) && isLoopBackedgeGuardedByCond(L, Pred, AR->getPostIncExpr(*this), OverflowLimit)))) { // Cache knowledge of AR NSW, then propagate NSW to the wide AddRec. const_cast(AR)->setNoWrapFlags(SCEV::FlagNSW); return getAddRecExpr( getExtendAddRecStart(AR, Ty, this), getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); } } // If Start and Step are constants, check if we can apply this // transformation: // sext{C1,+,C2} --> C1 + sext{0,+,C2} if C1 < C2 auto *SC1 = dyn_cast(Start); auto *SC2 = dyn_cast(Step); if (SC1 && SC2) { const APInt &C1 = SC1->getAPInt(); const APInt &C2 = SC2->getAPInt(); if (C1.isStrictlyPositive() && C2.isStrictlyPositive() && C2.ugt(C1) && C2.isPowerOf2()) { Start = getSignExtendExpr(Start, Ty); const SCEV *NewAR = getAddRecExpr(getZero(AR->getType()), Step, L, AR->getNoWrapFlags()); return getAddExpr(Start, getSignExtendExpr(NewAR, Ty)); } } if (proveNoWrapByVaryingStart(Start, Step, L)) { const_cast(AR)->setNoWrapFlags(SCEV::FlagNSW); return getAddRecExpr( getExtendAddRecStart(AR, Ty, this), getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); } } // If the input value is provably positive and we could not simplify // away the sext build a zext instead. if (isKnownNonNegative(Op)) return getZeroExtendExpr(Op, Ty); // The cast wasn't folded; create an explicit cast node. // Recompute the insert position, as it may have been invalidated. if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; SCEV *S = new (SCEVAllocator) SCEVSignExtendExpr(ID.Intern(SCEVAllocator), Op, Ty); UniqueSCEVs.InsertNode(S, IP); return S; } /// getAnyExtendExpr - Return a SCEV for the given operand extended with /// unspecified bits out to the given type. /// const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op, Type *Ty) { assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) && "This is not an extending conversion!"); assert(isSCEVable(Ty) && "This is not a conversion to a SCEVable type!"); Ty = getEffectiveSCEVType(Ty); // Sign-extend negative constants. if (const SCEVConstant *SC = dyn_cast(Op)) if (SC->getAPInt().isNegative()) return getSignExtendExpr(Op, Ty); // Peel off a truncate cast. if (const SCEVTruncateExpr *T = dyn_cast(Op)) { const SCEV *NewOp = T->getOperand(); if (getTypeSizeInBits(NewOp->getType()) < getTypeSizeInBits(Ty)) return getAnyExtendExpr(NewOp, Ty); return getTruncateOrNoop(NewOp, Ty); } // Next try a zext cast. If the cast is folded, use it. const SCEV *ZExt = getZeroExtendExpr(Op, Ty); if (!isa(ZExt)) return ZExt; // Next try a sext cast. If the cast is folded, use it. const SCEV *SExt = getSignExtendExpr(Op, Ty); if (!isa(SExt)) return SExt; // Force the cast to be folded into the operands of an addrec. if (const SCEVAddRecExpr *AR = dyn_cast(Op)) { SmallVector Ops; for (const SCEV *Op : AR->operands()) Ops.push_back(getAnyExtendExpr(Op, Ty)); return getAddRecExpr(Ops, AR->getLoop(), SCEV::FlagNW); } // If the expression is obviously signed, use the sext cast value. if (isa(Op)) return SExt; // Absent any other information, use the zext cast value. return ZExt; } /// Process the given Ops list, which is a list of operands to be added under /// the given scale, update the given map. This is a helper function for /// getAddRecExpr. As an example of what it does, given a sequence of operands /// that would form an add expression like this: /// /// m + n + 13 + (A * (o + p + (B * (q + m + 29)))) + r + (-1 * r) /// /// where A and B are constants, update the map with these values: /// /// (m, 1+A*B), (n, 1), (o, A), (p, A), (q, A*B), (r, 0) /// /// and add 13 + A*B*29 to AccumulatedConstant. /// This will allow getAddRecExpr to produce this: /// /// 13+A*B*29 + n + (m * (1+A*B)) + ((o + p) * A) + (q * A*B) /// /// This form often exposes folding opportunities that are hidden in /// the original operand list. /// /// Return true iff it appears that any interesting folding opportunities /// may be exposed. This helps getAddRecExpr short-circuit extra work in /// the common case where no interesting opportunities are present, and /// is also used as a check to avoid infinite recursion. /// static bool CollectAddOperandsWithScales(DenseMap &M, SmallVectorImpl &NewOps, APInt &AccumulatedConstant, const SCEV *const *Ops, size_t NumOperands, const APInt &Scale, ScalarEvolution &SE) { bool Interesting = false; // Iterate over the add operands. They are sorted, with constants first. unsigned i = 0; while (const SCEVConstant *C = dyn_cast(Ops[i])) { ++i; // Pull a buried constant out to the outside. if (Scale != 1 || AccumulatedConstant != 0 || C->getValue()->isZero()) Interesting = true; AccumulatedConstant += Scale * C->getAPInt(); } // Next comes everything else. We're especially interested in multiplies // here, but they're in the middle, so just visit the rest with one loop. for (; i != NumOperands; ++i) { const SCEVMulExpr *Mul = dyn_cast(Ops[i]); if (Mul && isa(Mul->getOperand(0))) { APInt NewScale = Scale * cast(Mul->getOperand(0))->getAPInt(); if (Mul->getNumOperands() == 2 && isa(Mul->getOperand(1))) { // A multiplication of a constant with another add; recurse. const SCEVAddExpr *Add = cast(Mul->getOperand(1)); Interesting |= CollectAddOperandsWithScales(M, NewOps, AccumulatedConstant, Add->op_begin(), Add->getNumOperands(), NewScale, SE); } else { // A multiplication of a constant with some other value. Update // the map. SmallVector MulOps(Mul->op_begin()+1, Mul->op_end()); const SCEV *Key = SE.getMulExpr(MulOps); auto Pair = M.insert({Key, NewScale}); if (Pair.second) { NewOps.push_back(Pair.first->first); } else { Pair.first->second += NewScale; // The map already had an entry for this value, which may indicate // a folding opportunity. Interesting = true; } } } else { // An ordinary operand. Update the map. std::pair::iterator, bool> Pair = M.insert({Ops[i], Scale}); if (Pair.second) { NewOps.push_back(Pair.first->first); } else { Pair.first->second += Scale; // The map already had an entry for this value, which may indicate // a folding opportunity. Interesting = true; } } } return Interesting; } // We're trying to construct a SCEV of type `Type' with `Ops' as operands and // `OldFlags' as can't-wrap behavior. Infer a more aggressive set of // can't-overflow flags for the operation if possible. static SCEV::NoWrapFlags StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type, const SmallVectorImpl &Ops, SCEV::NoWrapFlags Flags) { using namespace std::placeholders; typedef OverflowingBinaryOperator OBO; bool CanAnalyze = Type == scAddExpr || Type == scAddRecExpr || Type == scMulExpr; (void)CanAnalyze; assert(CanAnalyze && "don't call from other places!"); int SignOrUnsignMask = SCEV::FlagNUW | SCEV::FlagNSW; SCEV::NoWrapFlags SignOrUnsignWrap = ScalarEvolution::maskFlags(Flags, SignOrUnsignMask); // If FlagNSW is true and all the operands are non-negative, infer FlagNUW. auto IsKnownNonNegative = [&](const SCEV *S) { return SE->isKnownNonNegative(S); }; if (SignOrUnsignWrap == SCEV::FlagNSW && all_of(Ops, IsKnownNonNegative)) Flags = ScalarEvolution::setFlags(Flags, (SCEV::NoWrapFlags)SignOrUnsignMask); SignOrUnsignWrap = ScalarEvolution::maskFlags(Flags, SignOrUnsignMask); if (SignOrUnsignWrap != SignOrUnsignMask && Type == scAddExpr && Ops.size() == 2 && isa(Ops[0])) { // (A + C) --> (A + C) if the addition does not sign overflow // (A + C) --> (A + C) if the addition does not unsign overflow const APInt &C = cast(Ops[0])->getAPInt(); if (!(SignOrUnsignWrap & SCEV::FlagNSW)) { auto NSWRegion = ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Add, C, OBO::NoSignedWrap); if (NSWRegion.contains(SE->getSignedRange(Ops[1]))) Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW); } if (!(SignOrUnsignWrap & SCEV::FlagNUW)) { auto NUWRegion = ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Add, C, OBO::NoUnsignedWrap); if (NUWRegion.contains(SE->getUnsignedRange(Ops[1]))) Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW); } } return Flags; } /// Get a canonical add expression, or something simpler if possible. const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, SCEV::NoWrapFlags Flags) { assert(!(Flags & ~(SCEV::FlagNUW | SCEV::FlagNSW)) && "only nuw or nsw allowed"); assert(!Ops.empty() && "Cannot get empty add!"); if (Ops.size() == 1) return Ops[0]; #ifndef NDEBUG Type *ETy = getEffectiveSCEVType(Ops[0]->getType()); for (unsigned i = 1, e = Ops.size(); i != e; ++i) assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy && "SCEVAddExpr operand types don't match!"); #endif // Sort by complexity, this groups all similar expression types together. GroupByComplexity(Ops, &LI); Flags = StrengthenNoWrapFlags(this, scAddExpr, Ops, Flags); // If there are any constants, fold them together. unsigned Idx = 0; if (const SCEVConstant *LHSC = dyn_cast(Ops[0])) { ++Idx; assert(Idx < Ops.size()); while (const SCEVConstant *RHSC = dyn_cast(Ops[Idx])) { // We found two constants, fold them together! Ops[0] = getConstant(LHSC->getAPInt() + RHSC->getAPInt()); if (Ops.size() == 2) return Ops[0]; Ops.erase(Ops.begin()+1); // Erase the folded element LHSC = cast(Ops[0]); } // If we are left with a constant zero being added, strip it off. if (LHSC->getValue()->isZero()) { Ops.erase(Ops.begin()); --Idx; } if (Ops.size() == 1) return Ops[0]; } // Okay, check to see if the same value occurs in the operand list more than // once. If so, merge them together into an multiply expression. Since we // sorted the list, these values are required to be adjacent. Type *Ty = Ops[0]->getType(); bool FoundMatch = false; for (unsigned i = 0, e = Ops.size(); i != e-1; ++i) if (Ops[i] == Ops[i+1]) { // X + Y + Y --> X + Y*2 // Scan ahead to count how many equal operands there are. unsigned Count = 2; while (i+Count != e && Ops[i+Count] == Ops[i]) ++Count; // Merge the values into a multiply. const SCEV *Scale = getConstant(Ty, Count); const SCEV *Mul = getMulExpr(Scale, Ops[i]); if (Ops.size() == Count) return Mul; Ops[i] = Mul; Ops.erase(Ops.begin()+i+1, Ops.begin()+i+Count); --i; e -= Count - 1; FoundMatch = true; } if (FoundMatch) return getAddExpr(Ops, Flags); // Check for truncates. If all the operands are truncated from the same // type, see if factoring out the truncate would permit the result to be // folded. eg., trunc(x) + m*trunc(n) --> trunc(x + trunc(m)*n) // if the contents of the resulting outer trunc fold to something simple. for (; Idx < Ops.size() && isa(Ops[Idx]); ++Idx) { const SCEVTruncateExpr *Trunc = cast(Ops[Idx]); Type *DstType = Trunc->getType(); Type *SrcType = Trunc->getOperand()->getType(); SmallVector LargeOps; bool Ok = true; // Check all the operands to see if they can be represented in the // source type of the truncate. for (unsigned i = 0, e = Ops.size(); i != e; ++i) { if (const SCEVTruncateExpr *T = dyn_cast(Ops[i])) { if (T->getOperand()->getType() != SrcType) { Ok = false; break; } LargeOps.push_back(T->getOperand()); } else if (const SCEVConstant *C = dyn_cast(Ops[i])) { LargeOps.push_back(getAnyExtendExpr(C, SrcType)); } else if (const SCEVMulExpr *M = dyn_cast(Ops[i])) { SmallVector LargeMulOps; for (unsigned j = 0, f = M->getNumOperands(); j != f && Ok; ++j) { if (const SCEVTruncateExpr *T = dyn_cast(M->getOperand(j))) { if (T->getOperand()->getType() != SrcType) { Ok = false; break; } LargeMulOps.push_back(T->getOperand()); } else if (const auto *C = dyn_cast(M->getOperand(j))) { LargeMulOps.push_back(getAnyExtendExpr(C, SrcType)); } else { Ok = false; break; } } if (Ok) LargeOps.push_back(getMulExpr(LargeMulOps)); } else { Ok = false; break; } } if (Ok) { // Evaluate the expression in the larger type. const SCEV *Fold = getAddExpr(LargeOps, Flags); // If it folds to something simple, use it. Otherwise, don't. if (isa(Fold) || isa(Fold)) return getTruncateExpr(Fold, DstType); } } // Skip past any other cast SCEVs. while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddExpr) ++Idx; // If there are add operands they would be next. if (Idx < Ops.size()) { bool DeletedAdd = false; while (const SCEVAddExpr *Add = dyn_cast(Ops[Idx])) { // If we have an add, expand the add operands onto the end of the operands // list. Ops.erase(Ops.begin()+Idx); Ops.append(Add->op_begin(), Add->op_end()); DeletedAdd = true; } // If we deleted at least one add, we added operands to the end of the list, // and they are not necessarily sorted. Recurse to resort and resimplify // any operands we just acquired. if (DeletedAdd) return getAddExpr(Ops); } // Skip over the add expression until we get to a multiply. while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scMulExpr) ++Idx; // Check to see if there are any folding opportunities present with // operands multiplied by constant values. if (Idx < Ops.size() && isa(Ops[Idx])) { uint64_t BitWidth = getTypeSizeInBits(Ty); DenseMap M; SmallVector NewOps; APInt AccumulatedConstant(BitWidth, 0); if (CollectAddOperandsWithScales(M, NewOps, AccumulatedConstant, Ops.data(), Ops.size(), APInt(BitWidth, 1), *this)) { struct APIntCompare { bool operator()(const APInt &LHS, const APInt &RHS) const { return LHS.ult(RHS); } }; // Some interesting folding opportunity is present, so its worthwhile to // re-generate the operands list. Group the operands by constant scale, // to avoid multiplying by the same constant scale multiple times. std::map, APIntCompare> MulOpLists; for (const SCEV *NewOp : NewOps) MulOpLists[M.find(NewOp)->second].push_back(NewOp); // Re-generate the operands list. Ops.clear(); if (AccumulatedConstant != 0) Ops.push_back(getConstant(AccumulatedConstant)); for (auto &MulOp : MulOpLists) if (MulOp.first != 0) Ops.push_back(getMulExpr(getConstant(MulOp.first), getAddExpr(MulOp.second))); if (Ops.empty()) return getZero(Ty); if (Ops.size() == 1) return Ops[0]; return getAddExpr(Ops); } } // If we are adding something to a multiply expression, make sure the // something is not already an operand of the multiply. If so, merge it into // the multiply. for (; Idx < Ops.size() && isa(Ops[Idx]); ++Idx) { const SCEVMulExpr *Mul = cast(Ops[Idx]); for (unsigned MulOp = 0, e = Mul->getNumOperands(); MulOp != e; ++MulOp) { const SCEV *MulOpSCEV = Mul->getOperand(MulOp); if (isa(MulOpSCEV)) continue; for (unsigned AddOp = 0, e = Ops.size(); AddOp != e; ++AddOp) if (MulOpSCEV == Ops[AddOp]) { // Fold W + X + (X * Y * Z) --> W + (X * ((Y*Z)+1)) const SCEV *InnerMul = Mul->getOperand(MulOp == 0); if (Mul->getNumOperands() != 2) { // If the multiply has more than two operands, we must get the // Y*Z term. SmallVector MulOps(Mul->op_begin(), Mul->op_begin()+MulOp); MulOps.append(Mul->op_begin()+MulOp+1, Mul->op_end()); InnerMul = getMulExpr(MulOps); } const SCEV *One = getOne(Ty); const SCEV *AddOne = getAddExpr(One, InnerMul); const SCEV *OuterMul = getMulExpr(AddOne, MulOpSCEV); if (Ops.size() == 2) return OuterMul; if (AddOp < Idx) { Ops.erase(Ops.begin()+AddOp); Ops.erase(Ops.begin()+Idx-1); } else { Ops.erase(Ops.begin()+Idx); Ops.erase(Ops.begin()+AddOp-1); } Ops.push_back(OuterMul); return getAddExpr(Ops); } // Check this multiply against other multiplies being added together. for (unsigned OtherMulIdx = Idx+1; OtherMulIdx < Ops.size() && isa(Ops[OtherMulIdx]); ++OtherMulIdx) { const SCEVMulExpr *OtherMul = cast(Ops[OtherMulIdx]); // If MulOp occurs in OtherMul, we can fold the two multiplies // together. for (unsigned OMulOp = 0, e = OtherMul->getNumOperands(); OMulOp != e; ++OMulOp) if (OtherMul->getOperand(OMulOp) == MulOpSCEV) { // Fold X + (A*B*C) + (A*D*E) --> X + (A*(B*C+D*E)) const SCEV *InnerMul1 = Mul->getOperand(MulOp == 0); if (Mul->getNumOperands() != 2) { SmallVector MulOps(Mul->op_begin(), Mul->op_begin()+MulOp); MulOps.append(Mul->op_begin()+MulOp+1, Mul->op_end()); InnerMul1 = getMulExpr(MulOps); } const SCEV *InnerMul2 = OtherMul->getOperand(OMulOp == 0); if (OtherMul->getNumOperands() != 2) { SmallVector MulOps(OtherMul->op_begin(), OtherMul->op_begin()+OMulOp); MulOps.append(OtherMul->op_begin()+OMulOp+1, OtherMul->op_end()); InnerMul2 = getMulExpr(MulOps); } const SCEV *InnerMulSum = getAddExpr(InnerMul1,InnerMul2); const SCEV *OuterMul = getMulExpr(MulOpSCEV, InnerMulSum); if (Ops.size() == 2) return OuterMul; Ops.erase(Ops.begin()+Idx); Ops.erase(Ops.begin()+OtherMulIdx-1); Ops.push_back(OuterMul); return getAddExpr(Ops); } } } } // If there are any add recurrences in the operands list, see if any other // added values are loop invariant. If so, we can fold them into the // recurrence. while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddRecExpr) ++Idx; // Scan over all recurrences, trying to fold loop invariants into them. for (; Idx < Ops.size() && isa(Ops[Idx]); ++Idx) { // Scan all of the other operands to this add and add them to the vector if // they are loop invariant w.r.t. the recurrence. SmallVector LIOps; const SCEVAddRecExpr *AddRec = cast(Ops[Idx]); const Loop *AddRecLoop = AddRec->getLoop(); for (unsigned i = 0, e = Ops.size(); i != e; ++i) if (isLoopInvariant(Ops[i], AddRecLoop)) { LIOps.push_back(Ops[i]); Ops.erase(Ops.begin()+i); --i; --e; } // If we found some loop invariants, fold them into the recurrence. if (!LIOps.empty()) { // NLI + LI + {Start,+,Step} --> NLI + {LI+Start,+,Step} LIOps.push_back(AddRec->getStart()); SmallVector AddRecOps(AddRec->op_begin(), AddRec->op_end()); // This follows from the fact that the no-wrap flags on the outer add // expression are applicable on the 0th iteration, when the add recurrence // will be equal to its start value. AddRecOps[0] = getAddExpr(LIOps, Flags); // Build the new addrec. Propagate the NUW and NSW flags if both the // outer add and the inner addrec are guaranteed to have no overflow. // Always propagate NW. Flags = AddRec->getNoWrapFlags(setFlags(Flags, SCEV::FlagNW)); const SCEV *NewRec = getAddRecExpr(AddRecOps, AddRecLoop, Flags); // If all of the other operands were loop invariant, we are done. if (Ops.size() == 1) return NewRec; // Otherwise, add the folded AddRec by the non-invariant parts. for (unsigned i = 0;; ++i) if (Ops[i] == AddRec) { Ops[i] = NewRec; break; } return getAddExpr(Ops); } // Okay, if there weren't any loop invariants to be folded, check to see if // there are multiple AddRec's with the same loop induction variable being // added together. If so, we can fold them. for (unsigned OtherIdx = Idx+1; OtherIdx < Ops.size() && isa(Ops[OtherIdx]); ++OtherIdx) if (AddRecLoop == cast(Ops[OtherIdx])->getLoop()) { // Other + {A,+,B} + {C,+,D} --> Other + {A+C,+,B+D} SmallVector AddRecOps(AddRec->op_begin(), AddRec->op_end()); for (; OtherIdx != Ops.size() && isa(Ops[OtherIdx]); ++OtherIdx) if (const auto *OtherAddRec = dyn_cast(Ops[OtherIdx])) if (OtherAddRec->getLoop() == AddRecLoop) { for (unsigned i = 0, e = OtherAddRec->getNumOperands(); i != e; ++i) { if (i >= AddRecOps.size()) { AddRecOps.append(OtherAddRec->op_begin()+i, OtherAddRec->op_end()); break; } AddRecOps[i] = getAddExpr(AddRecOps[i], OtherAddRec->getOperand(i)); } Ops.erase(Ops.begin() + OtherIdx); --OtherIdx; } // Step size has changed, so we cannot guarantee no self-wraparound. Ops[Idx] = getAddRecExpr(AddRecOps, AddRecLoop, SCEV::FlagAnyWrap); return getAddExpr(Ops); } // Otherwise couldn't fold anything into this recurrence. Move onto the // next one. } // Okay, it looks like we really DO need an add expr. Check to see if we // already have one, otherwise create a new one. FoldingSetNodeID ID; ID.AddInteger(scAddExpr); for (unsigned i = 0, e = Ops.size(); i != e; ++i) ID.AddPointer(Ops[i]); void *IP = nullptr; SCEVAddExpr *S = static_cast(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); if (!S) { const SCEV **O = SCEVAllocator.Allocate(Ops.size()); std::uninitialized_copy(Ops.begin(), Ops.end(), O); S = new (SCEVAllocator) SCEVAddExpr(ID.Intern(SCEVAllocator), O, Ops.size()); UniqueSCEVs.InsertNode(S, IP); } S->setNoWrapFlags(Flags); return S; } static uint64_t umul_ov(uint64_t i, uint64_t j, bool &Overflow) { uint64_t k = i*j; if (j > 1 && k / j != i) Overflow = true; return k; } /// Compute the result of "n choose k", the binomial coefficient. If an /// intermediate computation overflows, Overflow will be set and the return will /// be garbage. Overflow is not cleared on absence of overflow. static uint64_t Choose(uint64_t n, uint64_t k, bool &Overflow) { // We use the multiplicative formula: // n(n-1)(n-2)...(n-(k-1)) / k(k-1)(k-2)...1 . // At each iteration, we take the n-th term of the numeral and divide by the // (k-n)th term of the denominator. This division will always produce an // integral result, and helps reduce the chance of overflow in the // intermediate computations. However, we can still overflow even when the // final result would fit. if (n == 0 || n == k) return 1; if (k > n) return 0; if (k > n/2) k = n-k; uint64_t r = 1; for (uint64_t i = 1; i <= k; ++i) { r = umul_ov(r, n-(i-1), Overflow); r /= i; } return r; } /// Determine if any of the operands in this SCEV are a constant or if /// any of the add or multiply expressions in this SCEV contain a constant. static bool containsConstantSomewhere(const SCEV *StartExpr) { SmallVector Ops; Ops.push_back(StartExpr); while (!Ops.empty()) { const SCEV *CurrentExpr = Ops.pop_back_val(); if (isa(*CurrentExpr)) return true; if (isa(*CurrentExpr) || isa(*CurrentExpr)) { const auto *CurrentNAry = cast(CurrentExpr); Ops.append(CurrentNAry->op_begin(), CurrentNAry->op_end()); } } return false; } /// Get a canonical multiply expression, or something simpler if possible. const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl &Ops, SCEV::NoWrapFlags Flags) { assert(Flags == maskFlags(Flags, SCEV::FlagNUW | SCEV::FlagNSW) && "only nuw or nsw allowed"); assert(!Ops.empty() && "Cannot get empty mul!"); if (Ops.size() == 1) return Ops[0]; #ifndef NDEBUG Type *ETy = getEffectiveSCEVType(Ops[0]->getType()); for (unsigned i = 1, e = Ops.size(); i != e; ++i) assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy && "SCEVMulExpr operand types don't match!"); #endif // Sort by complexity, this groups all similar expression types together. GroupByComplexity(Ops, &LI); Flags = StrengthenNoWrapFlags(this, scMulExpr, Ops, Flags); // If there are any constants, fold them together. unsigned Idx = 0; if (const SCEVConstant *LHSC = dyn_cast(Ops[0])) { // C1*(C2+V) -> C1*C2 + C1*V if (Ops.size() == 2) if (const SCEVAddExpr *Add = dyn_cast(Ops[1])) // If any of Add's ops are Adds or Muls with a constant, // apply this transformation as well. if (Add->getNumOperands() == 2) if (containsConstantSomewhere(Add)) return getAddExpr(getMulExpr(LHSC, Add->getOperand(0)), getMulExpr(LHSC, Add->getOperand(1))); ++Idx; while (const SCEVConstant *RHSC = dyn_cast(Ops[Idx])) { // We found two constants, fold them together! ConstantInt *Fold = ConstantInt::get(getContext(), LHSC->getAPInt() * RHSC->getAPInt()); Ops[0] = getConstant(Fold); Ops.erase(Ops.begin()+1); // Erase the folded element if (Ops.size() == 1) return Ops[0]; LHSC = cast(Ops[0]); } // If we are left with a constant one being multiplied, strip it off. if (cast(Ops[0])->getValue()->equalsInt(1)) { Ops.erase(Ops.begin()); --Idx; } else if (cast(Ops[0])->getValue()->isZero()) { // If we have a multiply of zero, it will always be zero. return Ops[0]; } else if (Ops[0]->isAllOnesValue()) { // If we have a mul by -1 of an add, try distributing the -1 among the // add operands. if (Ops.size() == 2) { if (const SCEVAddExpr *Add = dyn_cast(Ops[1])) { SmallVector NewOps; bool AnyFolded = false; for (const SCEV *AddOp : Add->operands()) { const SCEV *Mul = getMulExpr(Ops[0], AddOp); if (!isa(Mul)) AnyFolded = true; NewOps.push_back(Mul); } if (AnyFolded) return getAddExpr(NewOps); } else if (const auto *AddRec = dyn_cast(Ops[1])) { // Negation preserves a recurrence's no self-wrap property. SmallVector Operands; for (const SCEV *AddRecOp : AddRec->operands()) Operands.push_back(getMulExpr(Ops[0], AddRecOp)); return getAddRecExpr(Operands, AddRec->getLoop(), AddRec->getNoWrapFlags(SCEV::FlagNW)); } } } if (Ops.size() == 1) return Ops[0]; } // Skip over the add expression until we get to a multiply. while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scMulExpr) ++Idx; // If there are mul operands inline them all into this expression. if (Idx < Ops.size()) { bool DeletedMul = false; while (const SCEVMulExpr *Mul = dyn_cast(Ops[Idx])) { // If we have an mul, expand the mul operands onto the end of the operands // list. Ops.erase(Ops.begin()+Idx); Ops.append(Mul->op_begin(), Mul->op_end()); DeletedMul = true; } // If we deleted at least one mul, we added operands to the end of the list, // and they are not necessarily sorted. Recurse to resort and resimplify // any operands we just acquired. if (DeletedMul) return getMulExpr(Ops); } // If there are any add recurrences in the operands list, see if any other // added values are loop invariant. If so, we can fold them into the // recurrence. while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddRecExpr) ++Idx; // Scan over all recurrences, trying to fold loop invariants into them. for (; Idx < Ops.size() && isa(Ops[Idx]); ++Idx) { // Scan all of the other operands to this mul and add them to the vector if // they are loop invariant w.r.t. the recurrence. SmallVector LIOps; const SCEVAddRecExpr *AddRec = cast(Ops[Idx]); const Loop *AddRecLoop = AddRec->getLoop(); for (unsigned i = 0, e = Ops.size(); i != e; ++i) if (isLoopInvariant(Ops[i], AddRecLoop)) { LIOps.push_back(Ops[i]); Ops.erase(Ops.begin()+i); --i; --e; } // If we found some loop invariants, fold them into the recurrence. if (!LIOps.empty()) { // NLI * LI * {Start,+,Step} --> NLI * {LI*Start,+,LI*Step} SmallVector NewOps; NewOps.reserve(AddRec->getNumOperands()); const SCEV *Scale = getMulExpr(LIOps); for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) NewOps.push_back(getMulExpr(Scale, AddRec->getOperand(i))); // Build the new addrec. Propagate the NUW and NSW flags if both the // outer mul and the inner addrec are guaranteed to have no overflow. // // No self-wrap cannot be guaranteed after changing the step size, but // will be inferred if either NUW or NSW is true. Flags = AddRec->getNoWrapFlags(clearFlags(Flags, SCEV::FlagNW)); const SCEV *NewRec = getAddRecExpr(NewOps, AddRecLoop, Flags); // If all of the other operands were loop invariant, we are done. if (Ops.size() == 1) return NewRec; // Otherwise, multiply the folded AddRec by the non-invariant parts. for (unsigned i = 0;; ++i) if (Ops[i] == AddRec) { Ops[i] = NewRec; break; } return getMulExpr(Ops); } // Okay, if there weren't any loop invariants to be folded, check to see if // there are multiple AddRec's with the same loop induction variable being // multiplied together. If so, we can fold them. // {A1,+,A2,+,...,+,An} * {B1,+,B2,+,...,+,Bn} // = {x=1 in [ sum y=x..2x [ sum z=max(y-x, y-n)..min(x,n) [ // choose(x, 2x)*choose(2x-y, x-z)*A_{y-z}*B_z // ]]],+,...up to x=2n}. // Note that the arguments to choose() are always integers with values // known at compile time, never SCEV objects. // // The implementation avoids pointless extra computations when the two // addrec's are of different length (mathematically, it's equivalent to // an infinite stream of zeros on the right). bool OpsModified = false; for (unsigned OtherIdx = Idx+1; OtherIdx != Ops.size() && isa(Ops[OtherIdx]); ++OtherIdx) { const SCEVAddRecExpr *OtherAddRec = dyn_cast(Ops[OtherIdx]); if (!OtherAddRec || OtherAddRec->getLoop() != AddRecLoop) continue; bool Overflow = false; Type *Ty = AddRec->getType(); bool LargerThan64Bits = getTypeSizeInBits(Ty) > 64; SmallVector AddRecOps; for (int x = 0, xe = AddRec->getNumOperands() + OtherAddRec->getNumOperands() - 1; x != xe && !Overflow; ++x) { const SCEV *Term = getZero(Ty); for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) { uint64_t Coeff1 = Choose(x, 2*x - y, Overflow); for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1), ze = std::min(x+1, (int)OtherAddRec->getNumOperands()); z < ze && !Overflow; ++z) { uint64_t Coeff2 = Choose(2*x - y, x-z, Overflow); uint64_t Coeff; if (LargerThan64Bits) Coeff = umul_ov(Coeff1, Coeff2, Overflow); else Coeff = Coeff1*Coeff2; const SCEV *CoeffTerm = getConstant(Ty, Coeff); const SCEV *Term1 = AddRec->getOperand(y-z); const SCEV *Term2 = OtherAddRec->getOperand(z); Term = getAddExpr(Term, getMulExpr(CoeffTerm, Term1,Term2)); } } AddRecOps.push_back(Term); } if (!Overflow) { const SCEV *NewAddRec = getAddRecExpr(AddRecOps, AddRec->getLoop(), SCEV::FlagAnyWrap); if (Ops.size() == 2) return NewAddRec; Ops[Idx] = NewAddRec; Ops.erase(Ops.begin() + OtherIdx); --OtherIdx; OpsModified = true; AddRec = dyn_cast(NewAddRec); if (!AddRec) break; } } if (OpsModified) return getMulExpr(Ops); // Otherwise couldn't fold anything into this recurrence. Move onto the // next one. } // Okay, it looks like we really DO need an mul expr. Check to see if we // already have one, otherwise create a new one. FoldingSetNodeID ID; ID.AddInteger(scMulExpr); for (unsigned i = 0, e = Ops.size(); i != e; ++i) ID.AddPointer(Ops[i]); void *IP = nullptr; SCEVMulExpr *S = static_cast(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); if (!S) { const SCEV **O = SCEVAllocator.Allocate(Ops.size()); std::uninitialized_copy(Ops.begin(), Ops.end(), O); S = new (SCEVAllocator) SCEVMulExpr(ID.Intern(SCEVAllocator), O, Ops.size()); UniqueSCEVs.InsertNode(S, IP); } S->setNoWrapFlags(Flags); return S; } /// Get a canonical unsigned division expression, or something simpler if /// possible. const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, const SCEV *RHS) { assert(getEffectiveSCEVType(LHS->getType()) == getEffectiveSCEVType(RHS->getType()) && "SCEVUDivExpr operand types don't match!"); if (const SCEVConstant *RHSC = dyn_cast(RHS)) { if (RHSC->getValue()->equalsInt(1)) return LHS; // X udiv 1 --> x // If the denominator is zero, the result of the udiv is undefined. Don't // try to analyze it, because the resolution chosen here may differ from // the resolution chosen in other parts of the compiler. if (!RHSC->getValue()->isZero()) { // Determine if the division can be folded into the operands of // its operands. // TODO: Generalize this to non-constants by using known-bits information. Type *Ty = LHS->getType(); unsigned LZ = RHSC->getAPInt().countLeadingZeros(); unsigned MaxShiftAmt = getTypeSizeInBits(Ty) - LZ - 1; // For non-power-of-two values, effectively round the value up to the // nearest power of two. if (!RHSC->getAPInt().isPowerOf2()) ++MaxShiftAmt; IntegerType *ExtTy = IntegerType::get(getContext(), getTypeSizeInBits(Ty) + MaxShiftAmt); if (const SCEVAddRecExpr *AR = dyn_cast(LHS)) if (const SCEVConstant *Step = dyn_cast(AR->getStepRecurrence(*this))) { // {X,+,N}/C --> {X/C,+,N/C} if safe and N/C can be folded. const APInt &StepInt = Step->getAPInt(); const APInt &DivInt = RHSC->getAPInt(); if (!StepInt.urem(DivInt) && getZeroExtendExpr(AR, ExtTy) == getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy), getZeroExtendExpr(Step, ExtTy), AR->getLoop(), SCEV::FlagAnyWrap)) { SmallVector Operands; for (const SCEV *Op : AR->operands()) Operands.push_back(getUDivExpr(Op, RHS)); return getAddRecExpr(Operands, AR->getLoop(), SCEV::FlagNW); } /// Get a canonical UDivExpr for a recurrence. /// {X,+,N}/C => {Y,+,N}/C where Y=X-(X%N). Safe when C%N=0. // We can currently only fold X%N if X is constant. const SCEVConstant *StartC = dyn_cast(AR->getStart()); if (StartC && !DivInt.urem(StepInt) && getZeroExtendExpr(AR, ExtTy) == getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy), getZeroExtendExpr(Step, ExtTy), AR->getLoop(), SCEV::FlagAnyWrap)) { const APInt &StartInt = StartC->getAPInt(); const APInt &StartRem = StartInt.urem(StepInt); if (StartRem != 0) LHS = getAddRecExpr(getConstant(StartInt - StartRem), Step, AR->getLoop(), SCEV::FlagNW); } } // (A*B)/C --> A*(B/C) if safe and B/C can be folded. if (const SCEVMulExpr *M = dyn_cast(LHS)) { SmallVector Operands; for (const SCEV *Op : M->operands()) Operands.push_back(getZeroExtendExpr(Op, ExtTy)); if (getZeroExtendExpr(M, ExtTy) == getMulExpr(Operands)) // Find an operand that's safely divisible. for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i) { const SCEV *Op = M->getOperand(i); const SCEV *Div = getUDivExpr(Op, RHSC); if (!isa(Div) && getMulExpr(Div, RHSC) == Op) { Operands = SmallVector(M->op_begin(), M->op_end()); Operands[i] = Div; return getMulExpr(Operands); } } } // (A+B)/C --> (A/C + B/C) if safe and A/C and B/C can be folded. if (const SCEVAddExpr *A = dyn_cast(LHS)) { SmallVector Operands; for (const SCEV *Op : A->operands()) Operands.push_back(getZeroExtendExpr(Op, ExtTy)); if (getZeroExtendExpr(A, ExtTy) == getAddExpr(Operands)) { Operands.clear(); for (unsigned i = 0, e = A->getNumOperands(); i != e; ++i) { const SCEV *Op = getUDivExpr(A->getOperand(i), RHS); if (isa(Op) || getMulExpr(Op, RHS) != A->getOperand(i)) break; Operands.push_back(Op); } if (Operands.size() == A->getNumOperands()) return getAddExpr(Operands); } } // Fold if both operands are constant. if (const SCEVConstant *LHSC = dyn_cast(LHS)) { Constant *LHSCV = LHSC->getValue(); Constant *RHSCV = RHSC->getValue(); return getConstant(cast(ConstantExpr::getUDiv(LHSCV, RHSCV))); } } } FoldingSetNodeID ID; ID.AddInteger(scUDivExpr); ID.AddPointer(LHS); ID.AddPointer(RHS); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; SCEV *S = new (SCEVAllocator) SCEVUDivExpr(ID.Intern(SCEVAllocator), LHS, RHS); UniqueSCEVs.InsertNode(S, IP); return S; } static const APInt gcd(const SCEVConstant *C1, const SCEVConstant *C2) { APInt A = C1->getAPInt().abs(); APInt B = C2->getAPInt().abs(); uint32_t ABW = A.getBitWidth(); uint32_t BBW = B.getBitWidth(); if (ABW > BBW) B = B.zext(ABW); else if (ABW < BBW) A = A.zext(BBW); return APIntOps::GreatestCommonDivisor(A, B); } /// Get a canonical unsigned division expression, or something simpler if /// possible. There is no representation for an exact udiv in SCEV IR, but we /// can attempt to remove factors from the LHS and RHS. We can't do this when /// it's not exact because the udiv may be clearing bits. const SCEV *ScalarEvolution::getUDivExactExpr(const SCEV *LHS, const SCEV *RHS) { // TODO: we could try to find factors in all sorts of things, but for now we // just deal with u/exact (multiply, constant). See SCEVDivision towards the // end of this file for inspiration. const SCEVMulExpr *Mul = dyn_cast(LHS); if (!Mul) return getUDivExpr(LHS, RHS); if (const SCEVConstant *RHSCst = dyn_cast(RHS)) { // If the mulexpr multiplies by a constant, then that constant must be the // first element of the mulexpr. if (const auto *LHSCst = dyn_cast(Mul->getOperand(0))) { if (LHSCst == RHSCst) { SmallVector Operands; Operands.append(Mul->op_begin() + 1, Mul->op_end()); return getMulExpr(Operands); } // We can't just assume that LHSCst divides RHSCst cleanly, it could be // that there's a factor provided by one of the other terms. We need to // check. APInt Factor = gcd(LHSCst, RHSCst); if (!Factor.isIntN(1)) { LHSCst = cast(getConstant(LHSCst->getAPInt().udiv(Factor))); RHSCst = cast(getConstant(RHSCst->getAPInt().udiv(Factor))); SmallVector Operands; Operands.push_back(LHSCst); Operands.append(Mul->op_begin() + 1, Mul->op_end()); LHS = getMulExpr(Operands); RHS = RHSCst; Mul = dyn_cast(LHS); if (!Mul) return getUDivExactExpr(LHS, RHS); } } } for (int i = 0, e = Mul->getNumOperands(); i != e; ++i) { if (Mul->getOperand(i) == RHS) { SmallVector Operands; Operands.append(Mul->op_begin(), Mul->op_begin() + i); Operands.append(Mul->op_begin() + i + 1, Mul->op_end()); return getMulExpr(Operands); } } return getUDivExpr(LHS, RHS); } /// Get an add recurrence expression for the specified loop. Simplify the /// expression as much as possible. const SCEV *ScalarEvolution::getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags) { SmallVector Operands; Operands.push_back(Start); if (const SCEVAddRecExpr *StepChrec = dyn_cast(Step)) if (StepChrec->getLoop() == L) { Operands.append(StepChrec->op_begin(), StepChrec->op_end()); return getAddRecExpr(Operands, L, maskFlags(Flags, SCEV::FlagNW)); } Operands.push_back(Step); return getAddRecExpr(Operands, L, Flags); } /// Get an add recurrence expression for the specified loop. Simplify the /// expression as much as possible. const SCEV * ScalarEvolution::getAddRecExpr(SmallVectorImpl &Operands, const Loop *L, SCEV::NoWrapFlags Flags) { if (Operands.size() == 1) return Operands[0]; #ifndef NDEBUG Type *ETy = getEffectiveSCEVType(Operands[0]->getType()); for (unsigned i = 1, e = Operands.size(); i != e; ++i) assert(getEffectiveSCEVType(Operands[i]->getType()) == ETy && "SCEVAddRecExpr operand types don't match!"); for (unsigned i = 0, e = Operands.size(); i != e; ++i) assert(isLoopInvariant(Operands[i], L) && "SCEVAddRecExpr operand is not loop-invariant!"); #endif if (Operands.back()->isZero()) { Operands.pop_back(); return getAddRecExpr(Operands, L, SCEV::FlagAnyWrap); // {X,+,0} --> X } // It's tempting to want to call getMaxBackedgeTakenCount count here and // use that information to infer NUW and NSW flags. However, computing a // BE count requires calling getAddRecExpr, so we may not yet have a // meaningful BE count at this point (and if we don't, we'd be stuck // with a SCEVCouldNotCompute as the cached BE count). Flags = StrengthenNoWrapFlags(this, scAddRecExpr, Operands, Flags); // Canonicalize nested AddRecs in by nesting them in order of loop depth. if (const SCEVAddRecExpr *NestedAR = dyn_cast(Operands[0])) { const Loop *NestedLoop = NestedAR->getLoop(); if (L->contains(NestedLoop) ? (L->getLoopDepth() < NestedLoop->getLoopDepth()) : (!NestedLoop->contains(L) && DT.dominates(L->getHeader(), NestedLoop->getHeader()))) { SmallVector NestedOperands(NestedAR->op_begin(), NestedAR->op_end()); Operands[0] = NestedAR->getStart(); // AddRecs require their operands be loop-invariant with respect to their // loops. Don't perform this transformation if it would break this // requirement. bool AllInvariant = all_of( Operands, [&](const SCEV *Op) { return isLoopInvariant(Op, L); }); if (AllInvariant) { // Create a recurrence for the outer loop with the same step size. // // The outer recurrence keeps its NW flag but only keeps NUW/NSW if the // inner recurrence has the same property. SCEV::NoWrapFlags OuterFlags = maskFlags(Flags, SCEV::FlagNW | NestedAR->getNoWrapFlags()); NestedOperands[0] = getAddRecExpr(Operands, L, OuterFlags); AllInvariant = all_of(NestedOperands, [&](const SCEV *Op) { return isLoopInvariant(Op, NestedLoop); }); if (AllInvariant) { // Ok, both add recurrences are valid after the transformation. // // The inner recurrence keeps its NW flag but only keeps NUW/NSW if // the outer recurrence has the same property. SCEV::NoWrapFlags InnerFlags = maskFlags(NestedAR->getNoWrapFlags(), SCEV::FlagNW | Flags); return getAddRecExpr(NestedOperands, NestedLoop, InnerFlags); } } // Reset Operands to its original state. Operands[0] = NestedAR; } } // Okay, it looks like we really DO need an addrec expr. Check to see if we // already have one, otherwise create a new one. FoldingSetNodeID ID; ID.AddInteger(scAddRecExpr); for (unsigned i = 0, e = Operands.size(); i != e; ++i) ID.AddPointer(Operands[i]); ID.AddPointer(L); void *IP = nullptr; SCEVAddRecExpr *S = static_cast(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); if (!S) { const SCEV **O = SCEVAllocator.Allocate(Operands.size()); std::uninitialized_copy(Operands.begin(), Operands.end(), O); S = new (SCEVAllocator) SCEVAddRecExpr(ID.Intern(SCEVAllocator), O, Operands.size(), L); UniqueSCEVs.InsertNode(S, IP); } S->setNoWrapFlags(Flags); return S; } const SCEV * ScalarEvolution::getGEPExpr(Type *PointeeType, const SCEV *BaseExpr, const SmallVectorImpl &IndexExprs, bool InBounds) { // getSCEV(Base)->getType() has the same address space as Base->getType() // because SCEV::getType() preserves the address space. Type *IntPtrTy = getEffectiveSCEVType(BaseExpr->getType()); // FIXME(PR23527): Don't blindly transfer the inbounds flag from the GEP // instruction to its SCEV, because the Instruction may be guarded by control // flow and the no-overflow bits may not be valid for the expression in any // context. This can be fixed similarly to how these flags are handled for // adds. SCEV::NoWrapFlags Wrap = InBounds ? SCEV::FlagNSW : SCEV::FlagAnyWrap; const SCEV *TotalOffset = getZero(IntPtrTy); // The address space is unimportant. The first thing we do on CurTy is getting // its element type. Type *CurTy = PointerType::getUnqual(PointeeType); for (const SCEV *IndexExpr : IndexExprs) { // Compute the (potentially symbolic) offset in bytes for this index. if (StructType *STy = dyn_cast(CurTy)) { // For a struct, add the member offset. ConstantInt *Index = cast(IndexExpr)->getValue(); unsigned FieldNo = Index->getZExtValue(); const SCEV *FieldOffset = getOffsetOfExpr(IntPtrTy, STy, FieldNo); // Add the field offset to the running total offset. TotalOffset = getAddExpr(TotalOffset, FieldOffset); // Update CurTy to the type of the field at Index. CurTy = STy->getTypeAtIndex(Index); } else { // Update CurTy to its element type. CurTy = cast(CurTy)->getElementType(); // For an array, add the element offset, explicitly scaled. const SCEV *ElementSize = getSizeOfExpr(IntPtrTy, CurTy); // Getelementptr indices are signed. IndexExpr = getTruncateOrSignExtend(IndexExpr, IntPtrTy); // Multiply the index by the element size to compute the element offset. const SCEV *LocalOffset = getMulExpr(IndexExpr, ElementSize, Wrap); // Add the element offset to the running total offset. TotalOffset = getAddExpr(TotalOffset, LocalOffset); } } // Add the total offset from all the GEP indices to the base. return getAddExpr(BaseExpr, TotalOffset, Wrap); } const SCEV *ScalarEvolution::getSMaxExpr(const SCEV *LHS, const SCEV *RHS) { SmallVector Ops = {LHS, RHS}; return getSMaxExpr(Ops); } const SCEV * ScalarEvolution::getSMaxExpr(SmallVectorImpl &Ops) { assert(!Ops.empty() && "Cannot get empty smax!"); if (Ops.size() == 1) return Ops[0]; #ifndef NDEBUG Type *ETy = getEffectiveSCEVType(Ops[0]->getType()); for (unsigned i = 1, e = Ops.size(); i != e; ++i) assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy && "SCEVSMaxExpr operand types don't match!"); #endif // Sort by complexity, this groups all similar expression types together. GroupByComplexity(Ops, &LI); // If there are any constants, fold them together. unsigned Idx = 0; if (const SCEVConstant *LHSC = dyn_cast(Ops[0])) { ++Idx; assert(Idx < Ops.size()); while (const SCEVConstant *RHSC = dyn_cast(Ops[Idx])) { // We found two constants, fold them together! ConstantInt *Fold = ConstantInt::get( getContext(), APIntOps::smax(LHSC->getAPInt(), RHSC->getAPInt())); Ops[0] = getConstant(Fold); Ops.erase(Ops.begin()+1); // Erase the folded element if (Ops.size() == 1) return Ops[0]; LHSC = cast(Ops[0]); } // If we are left with a constant minimum-int, strip it off. if (cast(Ops[0])->getValue()->isMinValue(true)) { Ops.erase(Ops.begin()); --Idx; } else if (cast(Ops[0])->getValue()->isMaxValue(true)) { // If we have an smax with a constant maximum-int, it will always be // maximum-int. return Ops[0]; } if (Ops.size() == 1) return Ops[0]; } // Find the first SMax while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scSMaxExpr) ++Idx; // Check to see if one of the operands is an SMax. If so, expand its operands // onto our operand list, and recurse to simplify. if (Idx < Ops.size()) { bool DeletedSMax = false; while (const SCEVSMaxExpr *SMax = dyn_cast(Ops[Idx])) { Ops.erase(Ops.begin()+Idx); Ops.append(SMax->op_begin(), SMax->op_end()); DeletedSMax = true; } if (DeletedSMax) return getSMaxExpr(Ops); } // Okay, check to see if the same value occurs in the operand list twice. If // so, delete one. Since we sorted the list, these values are required to // be adjacent. for (unsigned i = 0, e = Ops.size()-1; i != e; ++i) // X smax Y smax Y --> X smax Y // X smax Y --> X, if X is always greater than Y if (Ops[i] == Ops[i+1] || isKnownPredicate(ICmpInst::ICMP_SGE, Ops[i], Ops[i+1])) { Ops.erase(Ops.begin()+i+1, Ops.begin()+i+2); --i; --e; } else if (isKnownPredicate(ICmpInst::ICMP_SLE, Ops[i], Ops[i+1])) { Ops.erase(Ops.begin()+i, Ops.begin()+i+1); --i; --e; } if (Ops.size() == 1) return Ops[0]; assert(!Ops.empty() && "Reduced smax down to nothing!"); // Okay, it looks like we really DO need an smax expr. Check to see if we // already have one, otherwise create a new one. FoldingSetNodeID ID; ID.AddInteger(scSMaxExpr); for (unsigned i = 0, e = Ops.size(); i != e; ++i) ID.AddPointer(Ops[i]); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; const SCEV **O = SCEVAllocator.Allocate(Ops.size()); std::uninitialized_copy(Ops.begin(), Ops.end(), O); SCEV *S = new (SCEVAllocator) SCEVSMaxExpr(ID.Intern(SCEVAllocator), O, Ops.size()); UniqueSCEVs.InsertNode(S, IP); return S; } const SCEV *ScalarEvolution::getUMaxExpr(const SCEV *LHS, const SCEV *RHS) { SmallVector Ops = {LHS, RHS}; return getUMaxExpr(Ops); } const SCEV * ScalarEvolution::getUMaxExpr(SmallVectorImpl &Ops) { assert(!Ops.empty() && "Cannot get empty umax!"); if (Ops.size() == 1) return Ops[0]; #ifndef NDEBUG Type *ETy = getEffectiveSCEVType(Ops[0]->getType()); for (unsigned i = 1, e = Ops.size(); i != e; ++i) assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy && "SCEVUMaxExpr operand types don't match!"); #endif // Sort by complexity, this groups all similar expression types together. GroupByComplexity(Ops, &LI); // If there are any constants, fold them together. unsigned Idx = 0; if (const SCEVConstant *LHSC = dyn_cast(Ops[0])) { ++Idx; assert(Idx < Ops.size()); while (const SCEVConstant *RHSC = dyn_cast(Ops[Idx])) { // We found two constants, fold them together! ConstantInt *Fold = ConstantInt::get( getContext(), APIntOps::umax(LHSC->getAPInt(), RHSC->getAPInt())); Ops[0] = getConstant(Fold); Ops.erase(Ops.begin()+1); // Erase the folded element if (Ops.size() == 1) return Ops[0]; LHSC = cast(Ops[0]); } // If we are left with a constant minimum-int, strip it off. if (cast(Ops[0])->getValue()->isMinValue(false)) { Ops.erase(Ops.begin()); --Idx; } else if (cast(Ops[0])->getValue()->isMaxValue(false)) { // If we have an umax with a constant maximum-int, it will always be // maximum-int. return Ops[0]; } if (Ops.size() == 1) return Ops[0]; } // Find the first UMax while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scUMaxExpr) ++Idx; // Check to see if one of the operands is a UMax. If so, expand its operands // onto our operand list, and recurse to simplify. if (Idx < Ops.size()) { bool DeletedUMax = false; while (const SCEVUMaxExpr *UMax = dyn_cast(Ops[Idx])) { Ops.erase(Ops.begin()+Idx); Ops.append(UMax->op_begin(), UMax->op_end()); DeletedUMax = true; } if (DeletedUMax) return getUMaxExpr(Ops); } // Okay, check to see if the same value occurs in the operand list twice. If // so, delete one. Since we sorted the list, these values are required to // be adjacent. for (unsigned i = 0, e = Ops.size()-1; i != e; ++i) // X umax Y umax Y --> X umax Y // X umax Y --> X, if X is always greater than Y if (Ops[i] == Ops[i+1] || isKnownPredicate(ICmpInst::ICMP_UGE, Ops[i], Ops[i+1])) { Ops.erase(Ops.begin()+i+1, Ops.begin()+i+2); --i; --e; } else if (isKnownPredicate(ICmpInst::ICMP_ULE, Ops[i], Ops[i+1])) { Ops.erase(Ops.begin()+i, Ops.begin()+i+1); --i; --e; } if (Ops.size() == 1) return Ops[0]; assert(!Ops.empty() && "Reduced umax down to nothing!"); // Okay, it looks like we really DO need a umax expr. Check to see if we // already have one, otherwise create a new one. FoldingSetNodeID ID; ID.AddInteger(scUMaxExpr); for (unsigned i = 0, e = Ops.size(); i != e; ++i) ID.AddPointer(Ops[i]); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; const SCEV **O = SCEVAllocator.Allocate(Ops.size()); std::uninitialized_copy(Ops.begin(), Ops.end(), O); SCEV *S = new (SCEVAllocator) SCEVUMaxExpr(ID.Intern(SCEVAllocator), O, Ops.size()); UniqueSCEVs.InsertNode(S, IP); return S; } const SCEV *ScalarEvolution::getSMinExpr(const SCEV *LHS, const SCEV *RHS) { // ~smax(~x, ~y) == smin(x, y). return getNotSCEV(getSMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS))); } const SCEV *ScalarEvolution::getUMinExpr(const SCEV *LHS, const SCEV *RHS) { // ~umax(~x, ~y) == umin(x, y) return getNotSCEV(getUMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS))); } const SCEV *ScalarEvolution::getSizeOfExpr(Type *IntTy, Type *AllocTy) { // We can bypass creating a target-independent // constant expression and then folding it back into a ConstantInt. // This is just a compile-time optimization. return getConstant(IntTy, getDataLayout().getTypeAllocSize(AllocTy)); } const SCEV *ScalarEvolution::getOffsetOfExpr(Type *IntTy, StructType *STy, unsigned FieldNo) { // We can bypass creating a target-independent // constant expression and then folding it back into a ConstantInt. // This is just a compile-time optimization. return getConstant( IntTy, getDataLayout().getStructLayout(STy)->getElementOffset(FieldNo)); } const SCEV *ScalarEvolution::getUnknown(Value *V) { // Don't attempt to do anything other than create a SCEVUnknown object // here. createSCEV only calls getUnknown after checking for all other // interesting possibilities, and any other code that calls getUnknown // is doing so in order to hide a value from SCEV canonicalization. FoldingSetNodeID ID; ID.AddInteger(scUnknown); ID.AddPointer(V); void *IP = nullptr; if (SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) { assert(cast(S)->getValue() == V && "Stale SCEVUnknown in uniquing map!"); return S; } SCEV *S = new (SCEVAllocator) SCEVUnknown(ID.Intern(SCEVAllocator), V, this, FirstUnknown); FirstUnknown = cast(S); UniqueSCEVs.InsertNode(S, IP); return S; } //===----------------------------------------------------------------------===// // Basic SCEV Analysis and PHI Idiom Recognition Code // /// Test if values of the given type are analyzable within the SCEV /// framework. This primarily includes integer types, and it can optionally /// include pointer types if the ScalarEvolution class has access to /// target-specific information. bool ScalarEvolution::isSCEVable(Type *Ty) const { // Integers and pointers are always SCEVable. return Ty->isIntegerTy() || Ty->isPointerTy(); } /// Return the size in bits of the specified type, for which isSCEVable must /// return true. uint64_t ScalarEvolution::getTypeSizeInBits(Type *Ty) const { assert(isSCEVable(Ty) && "Type is not SCEVable!"); return getDataLayout().getTypeSizeInBits(Ty); } /// Return a type with the same bitwidth as the given type and which represents /// how SCEV will treat the given type, for which isSCEVable must return /// true. For pointer types, this is the pointer-sized integer type. Type *ScalarEvolution::getEffectiveSCEVType(Type *Ty) const { assert(isSCEVable(Ty) && "Type is not SCEVable!"); if (Ty->isIntegerTy()) return Ty; // The only other support type is pointer. assert(Ty->isPointerTy() && "Unexpected non-pointer non-integer type!"); return getDataLayout().getIntPtrType(Ty); } const SCEV *ScalarEvolution::getCouldNotCompute() { return CouldNotCompute.get(); } bool ScalarEvolution::checkValidity(const SCEV *S) const { // Helper class working with SCEVTraversal to figure out if a SCEV contains // a SCEVUnknown with null value-pointer. FindInvalidSCEVUnknown::FindOne // is set iff if find such SCEVUnknown. // struct FindInvalidSCEVUnknown { bool FindOne; FindInvalidSCEVUnknown() { FindOne = false; } bool follow(const SCEV *S) { switch (static_cast(S->getSCEVType())) { case scConstant: return false; case scUnknown: if (!cast(S)->getValue()) FindOne = true; return false; default: return true; } } bool isDone() const { return FindOne; } }; FindInvalidSCEVUnknown F; SCEVTraversal ST(F); ST.visitAll(S); return !F.FindOne; } namespace { // Helper class working with SCEVTraversal to figure out if a SCEV contains // a sub SCEV of scAddRecExpr type. FindInvalidSCEVUnknown::FoundOne is set // iff if such sub scAddRecExpr type SCEV is found. struct FindAddRecurrence { bool FoundOne; FindAddRecurrence() : FoundOne(false) {} bool follow(const SCEV *S) { switch (static_cast(S->getSCEVType())) { case scAddRecExpr: FoundOne = true; case scConstant: case scUnknown: case scCouldNotCompute: return false; default: return true; } } bool isDone() const { return FoundOne; } }; } bool ScalarEvolution::containsAddRecurrence(const SCEV *S) { HasRecMapType::iterator I = HasRecMap.find_as(S); if (I != HasRecMap.end()) return I->second; FindAddRecurrence F; SCEVTraversal ST(F); ST.visitAll(S); HasRecMap.insert({S, F.FoundOne}); return F.FoundOne; } /// Return the Value set from S. SetVector *ScalarEvolution::getSCEVValues(const SCEV *S) { ExprValueMapType::iterator SI = ExprValueMap.find_as(S); if (SI == ExprValueMap.end()) return nullptr; #ifndef NDEBUG if (VerifySCEVMap) { // Check there is no dangling Value in the set returned. for (const auto &VE : SI->second) assert(ValueExprMap.count(VE)); } #endif return &SI->second; } /// Erase Value from ValueExprMap and ExprValueMap. If ValueExprMap.erase(V) is /// not used together with forgetMemoizedResults(S), eraseValueFromMap should be /// used instead to ensure whenever V->S is removed from ValueExprMap, V is also /// removed from the set of ExprValueMap[S]. void ScalarEvolution::eraseValueFromMap(Value *V) { ValueExprMapType::iterator I = ValueExprMap.find_as(V); if (I != ValueExprMap.end()) { const SCEV *S = I->second; SetVector *SV = getSCEVValues(S); // Remove V from the set of ExprValueMap[S] if (SV) SV->remove(V); ValueExprMap.erase(V); } } /// Return an existing SCEV if it exists, otherwise analyze the expression and /// create a new one. const SCEV *ScalarEvolution::getSCEV(Value *V) { assert(isSCEVable(V->getType()) && "Value is not SCEVable!"); const SCEV *S = getExistingSCEV(V); if (S == nullptr) { S = createSCEV(V); // During PHI resolution, it is possible to create two SCEVs for the same // V, so it is needed to double check whether V->S is inserted into // ValueExprMap before insert S->V into ExprValueMap. std::pair Pair = ValueExprMap.insert({SCEVCallbackVH(V, this), S}); if (Pair.second) ExprValueMap[S].insert(V); } return S; } const SCEV *ScalarEvolution::getExistingSCEV(Value *V) { assert(isSCEVable(V->getType()) && "Value is not SCEVable!"); ValueExprMapType::iterator I = ValueExprMap.find_as(V); if (I != ValueExprMap.end()) { const SCEV *S = I->second; if (checkValidity(S)) return S; forgetMemoizedResults(S); ValueExprMap.erase(I); } return nullptr; } /// Return a SCEV corresponding to -V = -1*V /// const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags) { if (const SCEVConstant *VC = dyn_cast(V)) return getConstant( cast(ConstantExpr::getNeg(VC->getValue()))); Type *Ty = V->getType(); Ty = getEffectiveSCEVType(Ty); return getMulExpr( V, getConstant(cast(Constant::getAllOnesValue(Ty))), Flags); } /// Return a SCEV corresponding to ~V = -1-V const SCEV *ScalarEvolution::getNotSCEV(const SCEV *V) { if (const SCEVConstant *VC = dyn_cast(V)) return getConstant( cast(ConstantExpr::getNot(VC->getValue()))); Type *Ty = V->getType(); Ty = getEffectiveSCEVType(Ty); const SCEV *AllOnes = getConstant(cast(Constant::getAllOnesValue(Ty))); return getMinusSCEV(AllOnes, V); } const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags) { // Fast path: X - X --> 0. if (LHS == RHS) return getZero(LHS->getType()); // We represent LHS - RHS as LHS + (-1)*RHS. This transformation // makes it so that we cannot make much use of NUW. auto AddFlags = SCEV::FlagAnyWrap; const bool RHSIsNotMinSigned = !getSignedRange(RHS).getSignedMin().isMinSignedValue(); if (maskFlags(Flags, SCEV::FlagNSW) == SCEV::FlagNSW) { // Let M be the minimum representable signed value. Then (-1)*RHS // signed-wraps if and only if RHS is M. That can happen even for // a NSW subtraction because e.g. (-1)*M signed-wraps even though // -1 - M does not. So to transfer NSW from LHS - RHS to LHS + // (-1)*RHS, we need to prove that RHS != M. // // If LHS is non-negative and we know that LHS - RHS does not // signed-wrap, then RHS cannot be M. So we can rule out signed-wrap // either by proving that RHS > M or that LHS >= 0. if (RHSIsNotMinSigned || isKnownNonNegative(LHS)) { AddFlags = SCEV::FlagNSW; } } // FIXME: Find a correct way to transfer NSW to (-1)*M when LHS - // RHS is NSW and LHS >= 0. // // The difficulty here is that the NSW flag may have been proven // relative to a loop that is to be found in a recurrence in LHS and // not in RHS. Applying NSW to (-1)*M may then let the NSW have a // larger scope than intended. auto NegFlags = RHSIsNotMinSigned ? SCEV::FlagNSW : SCEV::FlagAnyWrap; return getAddExpr(LHS, getNegativeSCEV(RHS, NegFlags), AddFlags); } const SCEV * ScalarEvolution::getTruncateOrZeroExtend(const SCEV *V, Type *Ty) { Type *SrcTy = V->getType(); assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) && (Ty->isIntegerTy() || Ty->isPointerTy()) && "Cannot truncate or zero extend with non-integer arguments!"); if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty)) return V; // No conversion if (getTypeSizeInBits(SrcTy) > getTypeSizeInBits(Ty)) return getTruncateExpr(V, Ty); return getZeroExtendExpr(V, Ty); } const SCEV * ScalarEvolution::getTruncateOrSignExtend(const SCEV *V, Type *Ty) { Type *SrcTy = V->getType(); assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) && (Ty->isIntegerTy() || Ty->isPointerTy()) && "Cannot truncate or zero extend with non-integer arguments!"); if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty)) return V; // No conversion if (getTypeSizeInBits(SrcTy) > getTypeSizeInBits(Ty)) return getTruncateExpr(V, Ty); return getSignExtendExpr(V, Ty); } const SCEV * ScalarEvolution::getNoopOrZeroExtend(const SCEV *V, Type *Ty) { Type *SrcTy = V->getType(); assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) && (Ty->isIntegerTy() || Ty->isPointerTy()) && "Cannot noop or zero extend with non-integer arguments!"); assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) && "getNoopOrZeroExtend cannot truncate!"); if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty)) return V; // No conversion return getZeroExtendExpr(V, Ty); } const SCEV * ScalarEvolution::getNoopOrSignExtend(const SCEV *V, Type *Ty) { Type *SrcTy = V->getType(); assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) && (Ty->isIntegerTy() || Ty->isPointerTy()) && "Cannot noop or sign extend with non-integer arguments!"); assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) && "getNoopOrSignExtend cannot truncate!"); if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty)) return V; // No conversion return getSignExtendExpr(V, Ty); } const SCEV * ScalarEvolution::getNoopOrAnyExtend(const SCEV *V, Type *Ty) { Type *SrcTy = V->getType(); assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) && (Ty->isIntegerTy() || Ty->isPointerTy()) && "Cannot noop or any extend with non-integer arguments!"); assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) && "getNoopOrAnyExtend cannot truncate!"); if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty)) return V; // No conversion return getAnyExtendExpr(V, Ty); } const SCEV * ScalarEvolution::getTruncateOrNoop(const SCEV *V, Type *Ty) { Type *SrcTy = V->getType(); assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) && (Ty->isIntegerTy() || Ty->isPointerTy()) && "Cannot truncate or noop with non-integer arguments!"); assert(getTypeSizeInBits(SrcTy) >= getTypeSizeInBits(Ty) && "getTruncateOrNoop cannot extend!"); if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty)) return V; // No conversion return getTruncateExpr(V, Ty); } const SCEV *ScalarEvolution::getUMaxFromMismatchedTypes(const SCEV *LHS, const SCEV *RHS) { const SCEV *PromotedLHS = LHS; const SCEV *PromotedRHS = RHS; if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(RHS->getType())) PromotedRHS = getZeroExtendExpr(RHS, LHS->getType()); else PromotedLHS = getNoopOrZeroExtend(LHS, RHS->getType()); return getUMaxExpr(PromotedLHS, PromotedRHS); } const SCEV *ScalarEvolution::getUMinFromMismatchedTypes(const SCEV *LHS, const SCEV *RHS) { const SCEV *PromotedLHS = LHS; const SCEV *PromotedRHS = RHS; if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(RHS->getType())) PromotedRHS = getZeroExtendExpr(RHS, LHS->getType()); else PromotedLHS = getNoopOrZeroExtend(LHS, RHS->getType()); return getUMinExpr(PromotedLHS, PromotedRHS); } const SCEV *ScalarEvolution::getPointerBase(const SCEV *V) { // A pointer operand may evaluate to a nonpointer expression, such as null. if (!V->getType()->isPointerTy()) return V; if (const SCEVCastExpr *Cast = dyn_cast(V)) { return getPointerBase(Cast->getOperand()); } else if (const SCEVNAryExpr *NAry = dyn_cast(V)) { const SCEV *PtrOp = nullptr; for (const SCEV *NAryOp : NAry->operands()) { if (NAryOp->getType()->isPointerTy()) { // Cannot find the base of an expression with multiple pointer operands. if (PtrOp) return V; PtrOp = NAryOp; } } if (!PtrOp) return V; return getPointerBase(PtrOp); } return V; } /// Push users of the given Instruction onto the given Worklist. static void PushDefUseChildren(Instruction *I, SmallVectorImpl &Worklist) { // Push the def-use children onto the Worklist stack. for (User *U : I->users()) Worklist.push_back(cast(U)); } void ScalarEvolution::forgetSymbolicName(Instruction *PN, const SCEV *SymName) { SmallVector Worklist; PushDefUseChildren(PN, Worklist); SmallPtrSet Visited; Visited.insert(PN); while (!Worklist.empty()) { Instruction *I = Worklist.pop_back_val(); if (!Visited.insert(I).second) continue; auto It = ValueExprMap.find_as(static_cast(I)); if (It != ValueExprMap.end()) { const SCEV *Old = It->second; // Short-circuit the def-use traversal if the symbolic name // ceases to appear in expressions. if (Old != SymName && !hasOperand(Old, SymName)) continue; // SCEVUnknown for a PHI either means that it has an unrecognized // structure, it's a PHI that's in the progress of being computed // by createNodeForPHI, or it's a single-value PHI. In the first case, // additional loop trip count information isn't going to change anything. // In the second case, createNodeForPHI will perform the necessary // updates on its own when it gets to that point. In the third, we do // want to forget the SCEVUnknown. if (!isa(I) || !isa(Old) || (I != PN && Old == SymName)) { forgetMemoizedResults(Old); ValueExprMap.erase(It); } } PushDefUseChildren(I, Worklist); } } namespace { class SCEVInitRewriter : public SCEVRewriteVisitor { public: static const SCEV *rewrite(const SCEV *S, const Loop *L, ScalarEvolution &SE) { SCEVInitRewriter Rewriter(L, SE); const SCEV *Result = Rewriter.visit(S); return Rewriter.isValid() ? Result : SE.getCouldNotCompute(); } SCEVInitRewriter(const Loop *L, ScalarEvolution &SE) : SCEVRewriteVisitor(SE), L(L), Valid(true) {} const SCEV *visitUnknown(const SCEVUnknown *Expr) { if (!(SE.getLoopDisposition(Expr, L) == ScalarEvolution::LoopInvariant)) Valid = false; return Expr; } const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { // Only allow AddRecExprs for this loop. if (Expr->getLoop() == L) return Expr->getStart(); Valid = false; return Expr; } bool isValid() { return Valid; } private: const Loop *L; bool Valid; }; class SCEVShiftRewriter : public SCEVRewriteVisitor { public: static const SCEV *rewrite(const SCEV *S, const Loop *L, ScalarEvolution &SE) { SCEVShiftRewriter Rewriter(L, SE); const SCEV *Result = Rewriter.visit(S); return Rewriter.isValid() ? Result : SE.getCouldNotCompute(); } SCEVShiftRewriter(const Loop *L, ScalarEvolution &SE) : SCEVRewriteVisitor(SE), L(L), Valid(true) {} const SCEV *visitUnknown(const SCEVUnknown *Expr) { // Only allow AddRecExprs for this loop. if (!(SE.getLoopDisposition(Expr, L) == ScalarEvolution::LoopInvariant)) Valid = false; return Expr; } const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { if (Expr->getLoop() == L && Expr->isAffine()) return SE.getMinusSCEV(Expr, Expr->getStepRecurrence(SE)); Valid = false; return Expr; } bool isValid() { return Valid; } private: const Loop *L; bool Valid; }; } // end anonymous namespace SCEV::NoWrapFlags ScalarEvolution::proveNoWrapViaConstantRanges(const SCEVAddRecExpr *AR) { if (!AR->isAffine()) return SCEV::FlagAnyWrap; typedef OverflowingBinaryOperator OBO; SCEV::NoWrapFlags Result = SCEV::FlagAnyWrap; if (!AR->hasNoSignedWrap()) { ConstantRange AddRecRange = getSignedRange(AR); ConstantRange IncRange = getSignedRange(AR->getStepRecurrence(*this)); auto NSWRegion = ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Add, IncRange, OBO::NoSignedWrap); if (NSWRegion.contains(AddRecRange)) Result = ScalarEvolution::setFlags(Result, SCEV::FlagNSW); } if (!AR->hasNoUnsignedWrap()) { ConstantRange AddRecRange = getUnsignedRange(AR); ConstantRange IncRange = getUnsignedRange(AR->getStepRecurrence(*this)); auto NUWRegion = ConstantRange::makeGuaranteedNoWrapRegion( Instruction::Add, IncRange, OBO::NoUnsignedWrap); if (NUWRegion.contains(AddRecRange)) Result = ScalarEvolution::setFlags(Result, SCEV::FlagNUW); } return Result; } namespace { /// Represents an abstract binary operation. This may exist as a /// normal instruction or constant expression, or may have been /// derived from an expression tree. struct BinaryOp { unsigned Opcode; Value *LHS; Value *RHS; bool IsNSW; bool IsNUW; /// Op is set if this BinaryOp corresponds to a concrete LLVM instruction or /// constant expression. Operator *Op; explicit BinaryOp(Operator *Op) : Opcode(Op->getOpcode()), LHS(Op->getOperand(0)), RHS(Op->getOperand(1)), IsNSW(false), IsNUW(false), Op(Op) { if (auto *OBO = dyn_cast(Op)) { IsNSW = OBO->hasNoSignedWrap(); IsNUW = OBO->hasNoUnsignedWrap(); } } explicit BinaryOp(unsigned Opcode, Value *LHS, Value *RHS, bool IsNSW = false, bool IsNUW = false) : Opcode(Opcode), LHS(LHS), RHS(RHS), IsNSW(IsNSW), IsNUW(IsNUW), Op(nullptr) {} }; } /// Try to map \p V into a BinaryOp, and return \c None on failure. static Optional MatchBinaryOp(Value *V, DominatorTree &DT) { auto *Op = dyn_cast(V); if (!Op) return None; // Implementation detail: all the cleverness here should happen without // creating new SCEV expressions -- our caller knowns tricks to avoid creating // SCEV expressions when possible, and we should not break that. switch (Op->getOpcode()) { case Instruction::Add: case Instruction::Sub: case Instruction::Mul: case Instruction::UDiv: case Instruction::And: case Instruction::Or: case Instruction::AShr: case Instruction::Shl: return BinaryOp(Op); case Instruction::Xor: if (auto *RHSC = dyn_cast(Op->getOperand(1))) // If the RHS of the xor is a signbit, then this is just an add. // Instcombine turns add of signbit into xor as a strength reduction step. if (RHSC->getValue().isSignBit()) return BinaryOp(Instruction::Add, Op->getOperand(0), Op->getOperand(1)); return BinaryOp(Op); case Instruction::LShr: // Turn logical shift right of a constant into a unsigned divide. if (ConstantInt *SA = dyn_cast(Op->getOperand(1))) { uint32_t BitWidth = cast(Op->getType())->getBitWidth(); // If the shift count is not less than the bitwidth, the result of // the shift is undefined. Don't try to analyze it, because the // resolution chosen here may differ from the resolution chosen in // other parts of the compiler. if (SA->getValue().ult(BitWidth)) { Constant *X = ConstantInt::get(SA->getContext(), APInt::getOneBitSet(BitWidth, SA->getZExtValue())); return BinaryOp(Instruction::UDiv, Op->getOperand(0), X); } } return BinaryOp(Op); case Instruction::ExtractValue: { auto *EVI = cast(Op); if (EVI->getNumIndices() != 1 || EVI->getIndices()[0] != 0) break; auto *CI = dyn_cast(EVI->getAggregateOperand()); if (!CI) break; if (auto *F = CI->getCalledFunction()) switch (F->getIntrinsicID()) { case Intrinsic::sadd_with_overflow: case Intrinsic::uadd_with_overflow: { if (!isOverflowIntrinsicNoWrap(cast(CI), DT)) return BinaryOp(Instruction::Add, CI->getArgOperand(0), CI->getArgOperand(1)); // Now that we know that all uses of the arithmetic-result component of // CI are guarded by the overflow check, we can go ahead and pretend // that the arithmetic is non-overflowing. if (F->getIntrinsicID() == Intrinsic::sadd_with_overflow) return BinaryOp(Instruction::Add, CI->getArgOperand(0), CI->getArgOperand(1), /* IsNSW = */ true, /* IsNUW = */ false); else return BinaryOp(Instruction::Add, CI->getArgOperand(0), CI->getArgOperand(1), /* IsNSW = */ false, /* IsNUW*/ true); } case Intrinsic::ssub_with_overflow: case Intrinsic::usub_with_overflow: return BinaryOp(Instruction::Sub, CI->getArgOperand(0), CI->getArgOperand(1)); case Intrinsic::smul_with_overflow: case Intrinsic::umul_with_overflow: return BinaryOp(Instruction::Mul, CI->getArgOperand(0), CI->getArgOperand(1)); default: break; } } default: break; } return None; } const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) { const Loop *L = LI.getLoopFor(PN->getParent()); if (!L || L->getHeader() != PN->getParent()) return nullptr; // The loop may have multiple entrances or multiple exits; we can analyze // this phi as an addrec if it has a unique entry value and a unique // backedge value. Value *BEValueV = nullptr, *StartValueV = nullptr; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *V = PN->getIncomingValue(i); if (L->contains(PN->getIncomingBlock(i))) { if (!BEValueV) { BEValueV = V; } else if (BEValueV != V) { BEValueV = nullptr; break; } } else if (!StartValueV) { StartValueV = V; } else if (StartValueV != V) { StartValueV = nullptr; break; } } if (BEValueV && StartValueV) { // While we are analyzing this PHI node, handle its value symbolically. const SCEV *SymbolicName = getUnknown(PN); assert(ValueExprMap.find_as(PN) == ValueExprMap.end() && "PHI node already processed?"); ValueExprMap.insert({SCEVCallbackVH(PN, this), SymbolicName}); // Using this symbolic name for the PHI, analyze the value coming around // the back-edge. const SCEV *BEValue = getSCEV(BEValueV); // NOTE: If BEValue is loop invariant, we know that the PHI node just // has a special value for the first iteration of the loop. // If the value coming around the backedge is an add with the symbolic // value we just inserted, then we found a simple induction variable! if (const SCEVAddExpr *Add = dyn_cast(BEValue)) { // If there is a single occurrence of the symbolic value, replace it // with a recurrence. unsigned FoundIndex = Add->getNumOperands(); for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) if (Add->getOperand(i) == SymbolicName) if (FoundIndex == e) { FoundIndex = i; break; } if (FoundIndex != Add->getNumOperands()) { // Create an add with everything but the specified operand. SmallVector Ops; for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) if (i != FoundIndex) Ops.push_back(Add->getOperand(i)); const SCEV *Accum = getAddExpr(Ops); // This is not a valid addrec if the step amount is varying each // loop iteration, but is not itself an addrec in this loop. if (isLoopInvariant(Accum, L) || (isa(Accum) && cast(Accum)->getLoop() == L)) { SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap; if (auto BO = MatchBinaryOp(BEValueV, DT)) { if (BO->Opcode == Instruction::Add && BO->LHS == PN) { if (BO->IsNUW) Flags = setFlags(Flags, SCEV::FlagNUW); if (BO->IsNSW) Flags = setFlags(Flags, SCEV::FlagNSW); } } else if (GEPOperator *GEP = dyn_cast(BEValueV)) { // If the increment is an inbounds GEP, then we know the address // space cannot be wrapped around. We cannot make any guarantee // about signed or unsigned overflow because pointers are // unsigned but we may have a negative index from the base // pointer. We can guarantee that no unsigned wrap occurs if the // indices form a positive value. if (GEP->isInBounds() && GEP->getOperand(0) == PN) { Flags = setFlags(Flags, SCEV::FlagNW); const SCEV *Ptr = getSCEV(GEP->getPointerOperand()); if (isKnownPositive(getMinusSCEV(getSCEV(GEP), Ptr))) Flags = setFlags(Flags, SCEV::FlagNUW); } // We cannot transfer nuw and nsw flags from subtraction // operations -- sub nuw X, Y is not the same as add nuw X, -Y // for instance. } const SCEV *StartVal = getSCEV(StartValueV); const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags); // Okay, for the entire analysis of this edge we assumed the PHI // to be symbolic. We now need to go back and purge all of the // entries for the scalars that use the symbolic expression. forgetSymbolicName(PN, SymbolicName); ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV; // We can add Flags to the post-inc expression only if we // know that it us *undefined behavior* for BEValueV to // overflow. if (auto *BEInst = dyn_cast(BEValueV)) if (isLoopInvariant(Accum, L) && isAddRecNeverPoison(BEInst, L)) (void)getAddRecExpr(getAddExpr(StartVal, Accum), Accum, L, Flags); return PHISCEV; } } } else { // Otherwise, this could be a loop like this: // i = 0; for (j = 1; ..; ++j) { .... i = j; } // In this case, j = {1,+,1} and BEValue is j. // Because the other in-value of i (0) fits the evolution of BEValue // i really is an addrec evolution. // // We can generalize this saying that i is the shifted value of BEValue // by one iteration: // PHI(f(0), f({1,+,1})) --> f({0,+,1}) const SCEV *Shifted = SCEVShiftRewriter::rewrite(BEValue, L, *this); const SCEV *Start = SCEVInitRewriter::rewrite(Shifted, L, *this); if (Shifted != getCouldNotCompute() && Start != getCouldNotCompute()) { const SCEV *StartVal = getSCEV(StartValueV); if (Start == StartVal) { // Okay, for the entire analysis of this edge we assumed the PHI // to be symbolic. We now need to go back and purge all of the // entries for the scalars that use the symbolic expression. forgetSymbolicName(PN, SymbolicName); ValueExprMap[SCEVCallbackVH(PN, this)] = Shifted; return Shifted; } } } // Remove the temporary PHI node SCEV that has been inserted while intending // to create an AddRecExpr for this PHI node. We can not keep this temporary // as it will prevent later (possibly simpler) SCEV expressions to be added // to the ValueExprMap. ValueExprMap.erase(PN); } return nullptr; } // Checks if the SCEV S is available at BB. S is considered available at BB // if S can be materialized at BB without introducing a fault. static bool IsAvailableOnEntry(const Loop *L, DominatorTree &DT, const SCEV *S, BasicBlock *BB) { struct CheckAvailable { bool TraversalDone = false; bool Available = true; const Loop *L = nullptr; // The loop BB is in (can be nullptr) BasicBlock *BB = nullptr; DominatorTree &DT; CheckAvailable(const Loop *L, BasicBlock *BB, DominatorTree &DT) : L(L), BB(BB), DT(DT) {} bool setUnavailable() { TraversalDone = true; Available = false; return false; } bool follow(const SCEV *S) { switch (S->getSCEVType()) { case scConstant: case scTruncate: case scZeroExtend: case scSignExtend: case scAddExpr: case scMulExpr: case scUMaxExpr: case scSMaxExpr: // These expressions are available if their operand(s) is/are. return true; case scAddRecExpr: { // We allow add recurrences that are on the loop BB is in, or some // outer loop. This guarantees availability because the value of the // add recurrence at BB is simply the "current" value of the induction // variable. We can relax this in the future; for instance an add // recurrence on a sibling dominating loop is also available at BB. const auto *ARLoop = cast(S)->getLoop(); if (L && (ARLoop == L || ARLoop->contains(L))) return true; return setUnavailable(); } case scUnknown: { // For SCEVUnknown, we check for simple dominance. const auto *SU = cast(S); Value *V = SU->getValue(); if (isa(V)) return false; if (isa(V) && DT.dominates(cast(V), BB)) return false; return setUnavailable(); } case scUDivExpr: case scCouldNotCompute: // We do not try to smart about these at all. return setUnavailable(); } llvm_unreachable("switch should be fully covered!"); } bool isDone() { return TraversalDone; } }; CheckAvailable CA(L, BB, DT); SCEVTraversal ST(CA); ST.visitAll(S); return CA.Available; } // Try to match a control flow sequence that branches out at BI and merges back // at Merge into a "C ? LHS : RHS" select pattern. Return true on a successful // match. static bool BrPHIToSelect(DominatorTree &DT, BranchInst *BI, PHINode *Merge, Value *&C, Value *&LHS, Value *&RHS) { C = BI->getCondition(); BasicBlockEdge LeftEdge(BI->getParent(), BI->getSuccessor(0)); BasicBlockEdge RightEdge(BI->getParent(), BI->getSuccessor(1)); if (!LeftEdge.isSingleEdge()) return false; assert(RightEdge.isSingleEdge() && "Follows from LeftEdge.isSingleEdge()"); Use &LeftUse = Merge->getOperandUse(0); Use &RightUse = Merge->getOperandUse(1); if (DT.dominates(LeftEdge, LeftUse) && DT.dominates(RightEdge, RightUse)) { LHS = LeftUse; RHS = RightUse; return true; } if (DT.dominates(LeftEdge, RightUse) && DT.dominates(RightEdge, LeftUse)) { LHS = RightUse; RHS = LeftUse; return true; } return false; } const SCEV *ScalarEvolution::createNodeFromSelectLikePHI(PHINode *PN) { if (PN->getNumIncomingValues() == 2) { const Loop *L = LI.getLoopFor(PN->getParent()); // We don't want to break LCSSA, even in a SCEV expression tree. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) if (LI.getLoopFor(PN->getIncomingBlock(i)) != L) return nullptr; // Try to match // // br %cond, label %left, label %right // left: // br label %merge // right: // br label %merge // merge: // V = phi [ %x, %left ], [ %y, %right ] // // as "select %cond, %x, %y" BasicBlock *IDom = DT[PN->getParent()]->getIDom()->getBlock(); assert(IDom && "At least the entry block should dominate PN"); auto *BI = dyn_cast(IDom->getTerminator()); Value *Cond = nullptr, *LHS = nullptr, *RHS = nullptr; if (BI && BI->isConditional() && BrPHIToSelect(DT, BI, PN, Cond, LHS, RHS) && IsAvailableOnEntry(L, DT, getSCEV(LHS), PN->getParent()) && IsAvailableOnEntry(L, DT, getSCEV(RHS), PN->getParent())) return createNodeForSelectOrPHI(PN, Cond, LHS, RHS); } return nullptr; } const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) { if (const SCEV *S = createAddRecFromPHI(PN)) return S; if (const SCEV *S = createNodeFromSelectLikePHI(PN)) return S; // If the PHI has a single incoming value, follow that value, unless the // PHI's incoming blocks are in a different loop, in which case doing so // risks breaking LCSSA form. Instcombine would normally zap these, but // it doesn't have DominatorTree information, so it may miss cases. if (Value *V = SimplifyInstruction(PN, getDataLayout(), &TLI, &DT, &AC)) if (LI.replacementPreservesLCSSAForm(PN, V)) return getSCEV(V); // If it's not a loop phi, we can't handle it yet. return getUnknown(PN); } const SCEV *ScalarEvolution::createNodeForSelectOrPHI(Instruction *I, Value *Cond, Value *TrueVal, Value *FalseVal) { // Handle "constant" branch or select. This can occur for instance when a // loop pass transforms an inner loop and moves on to process the outer loop. if (auto *CI = dyn_cast(Cond)) return getSCEV(CI->isOne() ? TrueVal : FalseVal); // Try to match some simple smax or umax patterns. auto *ICI = dyn_cast(Cond); if (!ICI) return getUnknown(I); Value *LHS = ICI->getOperand(0); Value *RHS = ICI->getOperand(1); switch (ICI->getPredicate()) { case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: std::swap(LHS, RHS); // fall through case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: // a >s b ? a+x : b+x -> smax(a, b)+x // a >s b ? b+x : a+x -> smin(a, b)+x if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType())) { const SCEV *LS = getNoopOrSignExtend(getSCEV(LHS), I->getType()); const SCEV *RS = getNoopOrSignExtend(getSCEV(RHS), I->getType()); const SCEV *LA = getSCEV(TrueVal); const SCEV *RA = getSCEV(FalseVal); const SCEV *LDiff = getMinusSCEV(LA, LS); const SCEV *RDiff = getMinusSCEV(RA, RS); if (LDiff == RDiff) return getAddExpr(getSMaxExpr(LS, RS), LDiff); LDiff = getMinusSCEV(LA, RS); RDiff = getMinusSCEV(RA, LS); if (LDiff == RDiff) return getAddExpr(getSMinExpr(LS, RS), LDiff); } break; case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: std::swap(LHS, RHS); // fall through case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: // a >u b ? a+x : b+x -> umax(a, b)+x // a >u b ? b+x : a+x -> umin(a, b)+x if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType())) { const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), I->getType()); const SCEV *RS = getNoopOrZeroExtend(getSCEV(RHS), I->getType()); const SCEV *LA = getSCEV(TrueVal); const SCEV *RA = getSCEV(FalseVal); const SCEV *LDiff = getMinusSCEV(LA, LS); const SCEV *RDiff = getMinusSCEV(RA, RS); if (LDiff == RDiff) return getAddExpr(getUMaxExpr(LS, RS), LDiff); LDiff = getMinusSCEV(LA, RS); RDiff = getMinusSCEV(RA, LS); if (LDiff == RDiff) return getAddExpr(getUMinExpr(LS, RS), LDiff); } break; case ICmpInst::ICMP_NE: // n != 0 ? n+x : 1+x -> umax(n, 1)+x if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType()) && isa(RHS) && cast(RHS)->isZero()) { const SCEV *One = getOne(I->getType()); const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), I->getType()); const SCEV *LA = getSCEV(TrueVal); const SCEV *RA = getSCEV(FalseVal); const SCEV *LDiff = getMinusSCEV(LA, LS); const SCEV *RDiff = getMinusSCEV(RA, One); if (LDiff == RDiff) return getAddExpr(getUMaxExpr(One, LS), LDiff); } break; case ICmpInst::ICMP_EQ: // n == 0 ? 1+x : n+x -> umax(n, 1)+x if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType()) && isa(RHS) && cast(RHS)->isZero()) { const SCEV *One = getOne(I->getType()); const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), I->getType()); const SCEV *LA = getSCEV(TrueVal); const SCEV *RA = getSCEV(FalseVal); const SCEV *LDiff = getMinusSCEV(LA, One); const SCEV *RDiff = getMinusSCEV(RA, LS); if (LDiff == RDiff) return getAddExpr(getUMaxExpr(One, LS), LDiff); } break; default: break; } return getUnknown(I); } /// Expand GEP instructions into add and multiply operations. This allows them /// to be analyzed by regular SCEV code. const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) { // Don't attempt to analyze GEPs over unsized objects. if (!GEP->getSourceElementType()->isSized()) return getUnknown(GEP); SmallVector IndexExprs; for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index) IndexExprs.push_back(getSCEV(*Index)); return getGEPExpr(GEP->getSourceElementType(), getSCEV(GEP->getPointerOperand()), IndexExprs, GEP->isInBounds()); } uint32_t ScalarEvolution::GetMinTrailingZeros(const SCEV *S) { if (const SCEVConstant *C = dyn_cast(S)) return C->getAPInt().countTrailingZeros(); if (const SCEVTruncateExpr *T = dyn_cast(S)) return std::min(GetMinTrailingZeros(T->getOperand()), (uint32_t)getTypeSizeInBits(T->getType())); if (const SCEVZeroExtendExpr *E = dyn_cast(S)) { uint32_t OpRes = GetMinTrailingZeros(E->getOperand()); return OpRes == getTypeSizeInBits(E->getOperand()->getType()) ? getTypeSizeInBits(E->getType()) : OpRes; } if (const SCEVSignExtendExpr *E = dyn_cast(S)) { uint32_t OpRes = GetMinTrailingZeros(E->getOperand()); return OpRes == getTypeSizeInBits(E->getOperand()->getType()) ? getTypeSizeInBits(E->getType()) : OpRes; } if (const SCEVAddExpr *A = dyn_cast(S)) { // The result is the min of all operands results. uint32_t MinOpRes = GetMinTrailingZeros(A->getOperand(0)); for (unsigned i = 1, e = A->getNumOperands(); MinOpRes && i != e; ++i) MinOpRes = std::min(MinOpRes, GetMinTrailingZeros(A->getOperand(i))); return MinOpRes; } if (const SCEVMulExpr *M = dyn_cast(S)) { // The result is the sum of all operands results. uint32_t SumOpRes = GetMinTrailingZeros(M->getOperand(0)); uint32_t BitWidth = getTypeSizeInBits(M->getType()); for (unsigned i = 1, e = M->getNumOperands(); SumOpRes != BitWidth && i != e; ++i) SumOpRes = std::min(SumOpRes + GetMinTrailingZeros(M->getOperand(i)), BitWidth); return SumOpRes; } if (const SCEVAddRecExpr *A = dyn_cast(S)) { // The result is the min of all operands results. uint32_t MinOpRes = GetMinTrailingZeros(A->getOperand(0)); for (unsigned i = 1, e = A->getNumOperands(); MinOpRes && i != e; ++i) MinOpRes = std::min(MinOpRes, GetMinTrailingZeros(A->getOperand(i))); return MinOpRes; } if (const SCEVSMaxExpr *M = dyn_cast(S)) { // The result is the min of all operands results. uint32_t MinOpRes = GetMinTrailingZeros(M->getOperand(0)); for (unsigned i = 1, e = M->getNumOperands(); MinOpRes && i != e; ++i) MinOpRes = std::min(MinOpRes, GetMinTrailingZeros(M->getOperand(i))); return MinOpRes; } if (const SCEVUMaxExpr *M = dyn_cast(S)) { // The result is the min of all operands results. uint32_t MinOpRes = GetMinTrailingZeros(M->getOperand(0)); for (unsigned i = 1, e = M->getNumOperands(); MinOpRes && i != e; ++i) MinOpRes = std::min(MinOpRes, GetMinTrailingZeros(M->getOperand(i))); return MinOpRes; } if (const SCEVUnknown *U = dyn_cast(S)) { // For a SCEVUnknown, ask ValueTracking. unsigned BitWidth = getTypeSizeInBits(U->getType()); APInt Zeros(BitWidth, 0), Ones(BitWidth, 0); computeKnownBits(U->getValue(), Zeros, Ones, getDataLayout(), 0, &AC, nullptr, &DT); return Zeros.countTrailingOnes(); } // SCEVUDivExpr return 0; } /// Helper method to assign a range to V from metadata present in the IR. static Optional GetRangeFromMetadata(Value *V) { if (Instruction *I = dyn_cast(V)) if (MDNode *MD = I->getMetadata(LLVMContext::MD_range)) return getConstantRangeFromMetadata(*MD); return None; } /// Determine the range for a particular SCEV. If SignHint is /// HINT_RANGE_UNSIGNED (resp. HINT_RANGE_SIGNED) then getRange prefers ranges /// with a "cleaner" unsigned (resp. signed) representation. ConstantRange ScalarEvolution::getRange(const SCEV *S, ScalarEvolution::RangeSignHint SignHint) { DenseMap &Cache = SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED ? UnsignedRanges : SignedRanges; // See if we've computed this range already. DenseMap::iterator I = Cache.find(S); if (I != Cache.end()) return I->second; if (const SCEVConstant *C = dyn_cast(S)) return setRange(C, SignHint, ConstantRange(C->getAPInt())); unsigned BitWidth = getTypeSizeInBits(S->getType()); ConstantRange ConservativeResult(BitWidth, /*isFullSet=*/true); // If the value has known zeros, the maximum value will have those known zeros // as well. uint32_t TZ = GetMinTrailingZeros(S); if (TZ != 0) { if (SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED) ConservativeResult = ConstantRange(APInt::getMinValue(BitWidth), APInt::getMaxValue(BitWidth).lshr(TZ).shl(TZ) + 1); else ConservativeResult = ConstantRange( APInt::getSignedMinValue(BitWidth), APInt::getSignedMaxValue(BitWidth).ashr(TZ).shl(TZ) + 1); } if (const SCEVAddExpr *Add = dyn_cast(S)) { ConstantRange X = getRange(Add->getOperand(0), SignHint); for (unsigned i = 1, e = Add->getNumOperands(); i != e; ++i) X = X.add(getRange(Add->getOperand(i), SignHint)); return setRange(Add, SignHint, ConservativeResult.intersectWith(X)); } if (const SCEVMulExpr *Mul = dyn_cast(S)) { ConstantRange X = getRange(Mul->getOperand(0), SignHint); for (unsigned i = 1, e = Mul->getNumOperands(); i != e; ++i) X = X.multiply(getRange(Mul->getOperand(i), SignHint)); return setRange(Mul, SignHint, ConservativeResult.intersectWith(X)); } if (const SCEVSMaxExpr *SMax = dyn_cast(S)) { ConstantRange X = getRange(SMax->getOperand(0), SignHint); for (unsigned i = 1, e = SMax->getNumOperands(); i != e; ++i) X = X.smax(getRange(SMax->getOperand(i), SignHint)); return setRange(SMax, SignHint, ConservativeResult.intersectWith(X)); } if (const SCEVUMaxExpr *UMax = dyn_cast(S)) { ConstantRange X = getRange(UMax->getOperand(0), SignHint); for (unsigned i = 1, e = UMax->getNumOperands(); i != e; ++i) X = X.umax(getRange(UMax->getOperand(i), SignHint)); return setRange(UMax, SignHint, ConservativeResult.intersectWith(X)); } if (const SCEVUDivExpr *UDiv = dyn_cast(S)) { ConstantRange X = getRange(UDiv->getLHS(), SignHint); ConstantRange Y = getRange(UDiv->getRHS(), SignHint); return setRange(UDiv, SignHint, ConservativeResult.intersectWith(X.udiv(Y))); } if (const SCEVZeroExtendExpr *ZExt = dyn_cast(S)) { ConstantRange X = getRange(ZExt->getOperand(), SignHint); return setRange(ZExt, SignHint, ConservativeResult.intersectWith(X.zeroExtend(BitWidth))); } if (const SCEVSignExtendExpr *SExt = dyn_cast(S)) { ConstantRange X = getRange(SExt->getOperand(), SignHint); return setRange(SExt, SignHint, ConservativeResult.intersectWith(X.signExtend(BitWidth))); } if (const SCEVTruncateExpr *Trunc = dyn_cast(S)) { ConstantRange X = getRange(Trunc->getOperand(), SignHint); return setRange(Trunc, SignHint, ConservativeResult.intersectWith(X.truncate(BitWidth))); } if (const SCEVAddRecExpr *AddRec = dyn_cast(S)) { // If there's no unsigned wrap, the value will never be less than its // initial value. if (AddRec->hasNoUnsignedWrap()) if (const SCEVConstant *C = dyn_cast(AddRec->getStart())) if (!C->getValue()->isZero()) ConservativeResult = ConservativeResult.intersectWith( ConstantRange(C->getAPInt(), APInt(BitWidth, 0))); // If there's no signed wrap, and all the operands have the same sign or // zero, the value won't ever change sign. if (AddRec->hasNoSignedWrap()) { bool AllNonNeg = true; bool AllNonPos = true; for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) { if (!isKnownNonNegative(AddRec->getOperand(i))) AllNonNeg = false; if (!isKnownNonPositive(AddRec->getOperand(i))) AllNonPos = false; } if (AllNonNeg) ConservativeResult = ConservativeResult.intersectWith( ConstantRange(APInt(BitWidth, 0), APInt::getSignedMinValue(BitWidth))); else if (AllNonPos) ConservativeResult = ConservativeResult.intersectWith( ConstantRange(APInt::getSignedMinValue(BitWidth), APInt(BitWidth, 1))); } // TODO: non-affine addrec if (AddRec->isAffine()) { const SCEV *MaxBECount = getMaxBackedgeTakenCount(AddRec->getLoop()); if (!isa(MaxBECount) && getTypeSizeInBits(MaxBECount->getType()) <= BitWidth) { auto RangeFromAffine = getRangeForAffineAR( AddRec->getStart(), AddRec->getStepRecurrence(*this), MaxBECount, BitWidth); if (!RangeFromAffine.isFullSet()) ConservativeResult = ConservativeResult.intersectWith(RangeFromAffine); auto RangeFromFactoring = getRangeViaFactoring( AddRec->getStart(), AddRec->getStepRecurrence(*this), MaxBECount, BitWidth); if (!RangeFromFactoring.isFullSet()) ConservativeResult = ConservativeResult.intersectWith(RangeFromFactoring); } } return setRange(AddRec, SignHint, ConservativeResult); } if (const SCEVUnknown *U = dyn_cast(S)) { // Check if the IR explicitly contains !range metadata. Optional MDRange = GetRangeFromMetadata(U->getValue()); if (MDRange.hasValue()) ConservativeResult = ConservativeResult.intersectWith(MDRange.getValue()); // Split here to avoid paying the compile-time cost of calling both // computeKnownBits and ComputeNumSignBits. This restriction can be lifted // if needed. const DataLayout &DL = getDataLayout(); if (SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED) { // For a SCEVUnknown, ask ValueTracking. APInt Zeros(BitWidth, 0), Ones(BitWidth, 0); computeKnownBits(U->getValue(), Zeros, Ones, DL, 0, &AC, nullptr, &DT); if (Ones != ~Zeros + 1) ConservativeResult = ConservativeResult.intersectWith(ConstantRange(Ones, ~Zeros + 1)); } else { assert(SignHint == ScalarEvolution::HINT_RANGE_SIGNED && "generalize as needed!"); unsigned NS = ComputeNumSignBits(U->getValue(), DL, 0, &AC, nullptr, &DT); if (NS > 1) ConservativeResult = ConservativeResult.intersectWith( ConstantRange(APInt::getSignedMinValue(BitWidth).ashr(NS - 1), APInt::getSignedMaxValue(BitWidth).ashr(NS - 1) + 1)); } return setRange(U, SignHint, ConservativeResult); } return setRange(S, SignHint, ConservativeResult); } ConstantRange ScalarEvolution::getRangeForAffineAR(const SCEV *Start, const SCEV *Step, const SCEV *MaxBECount, unsigned BitWidth) { assert(!isa(MaxBECount) && getTypeSizeInBits(MaxBECount->getType()) <= BitWidth && "Precondition!"); ConstantRange Result(BitWidth, /* isFullSet = */ true); // Check for overflow. This must be done with ConstantRange arithmetic // because we could be called from within the ScalarEvolution overflow // checking code. MaxBECount = getNoopOrZeroExtend(MaxBECount, Start->getType()); ConstantRange MaxBECountRange = getUnsignedRange(MaxBECount); ConstantRange ZExtMaxBECountRange = MaxBECountRange.zextOrTrunc(BitWidth * 2 + 1); ConstantRange StepSRange = getSignedRange(Step); ConstantRange SExtStepSRange = StepSRange.sextOrTrunc(BitWidth * 2 + 1); ConstantRange StartURange = getUnsignedRange(Start); ConstantRange EndURange = StartURange.add(MaxBECountRange.multiply(StepSRange)); // Check for unsigned overflow. ConstantRange ZExtStartURange = StartURange.zextOrTrunc(BitWidth * 2 + 1); ConstantRange ZExtEndURange = EndURange.zextOrTrunc(BitWidth * 2 + 1); if (ZExtStartURange.add(ZExtMaxBECountRange.multiply(SExtStepSRange)) == ZExtEndURange) { APInt Min = APIntOps::umin(StartURange.getUnsignedMin(), EndURange.getUnsignedMin()); APInt Max = APIntOps::umax(StartURange.getUnsignedMax(), EndURange.getUnsignedMax()); bool IsFullRange = Min.isMinValue() && Max.isMaxValue(); if (!IsFullRange) Result = Result.intersectWith(ConstantRange(Min, Max + 1)); } ConstantRange StartSRange = getSignedRange(Start); ConstantRange EndSRange = StartSRange.add(MaxBECountRange.multiply(StepSRange)); // Check for signed overflow. This must be done with ConstantRange // arithmetic because we could be called from within the ScalarEvolution // overflow checking code. ConstantRange SExtStartSRange = StartSRange.sextOrTrunc(BitWidth * 2 + 1); ConstantRange SExtEndSRange = EndSRange.sextOrTrunc(BitWidth * 2 + 1); if (SExtStartSRange.add(ZExtMaxBECountRange.multiply(SExtStepSRange)) == SExtEndSRange) { APInt Min = APIntOps::smin(StartSRange.getSignedMin(), EndSRange.getSignedMin()); APInt Max = APIntOps::smax(StartSRange.getSignedMax(), EndSRange.getSignedMax()); bool IsFullRange = Min.isMinSignedValue() && Max.isMaxSignedValue(); if (!IsFullRange) Result = Result.intersectWith(ConstantRange(Min, Max + 1)); } return Result; } ConstantRange ScalarEvolution::getRangeViaFactoring(const SCEV *Start, const SCEV *Step, const SCEV *MaxBECount, unsigned BitWidth) { // RangeOf({C?A:B,+,C?P:Q}) == RangeOf(C?{A,+,P}:{B,+,Q}) // == RangeOf({A,+,P}) union RangeOf({B,+,Q}) struct SelectPattern { Value *Condition = nullptr; APInt TrueValue; APInt FalseValue; explicit SelectPattern(ScalarEvolution &SE, unsigned BitWidth, const SCEV *S) { Optional CastOp; APInt Offset(BitWidth, 0); assert(SE.getTypeSizeInBits(S->getType()) == BitWidth && "Should be!"); // Peel off a constant offset: if (auto *SA = dyn_cast(S)) { // In the future we could consider being smarter here and handle // {Start+Step,+,Step} too. if (SA->getNumOperands() != 2 || !isa(SA->getOperand(0))) return; Offset = cast(SA->getOperand(0))->getAPInt(); S = SA->getOperand(1); } // Peel off a cast operation if (auto *SCast = dyn_cast(S)) { CastOp = SCast->getSCEVType(); S = SCast->getOperand(); } using namespace llvm::PatternMatch; auto *SU = dyn_cast(S); const APInt *TrueVal, *FalseVal; if (!SU || !match(SU->getValue(), m_Select(m_Value(Condition), m_APInt(TrueVal), m_APInt(FalseVal)))) { Condition = nullptr; return; } TrueValue = *TrueVal; FalseValue = *FalseVal; // Re-apply the cast we peeled off earlier if (CastOp.hasValue()) switch (*CastOp) { default: llvm_unreachable("Unknown SCEV cast type!"); case scTruncate: TrueValue = TrueValue.trunc(BitWidth); FalseValue = FalseValue.trunc(BitWidth); break; case scZeroExtend: TrueValue = TrueValue.zext(BitWidth); FalseValue = FalseValue.zext(BitWidth); break; case scSignExtend: TrueValue = TrueValue.sext(BitWidth); FalseValue = FalseValue.sext(BitWidth); break; } // Re-apply the constant offset we peeled off earlier TrueValue += Offset; FalseValue += Offset; } bool isRecognized() { return Condition != nullptr; } }; SelectPattern StartPattern(*this, BitWidth, Start); if (!StartPattern.isRecognized()) return ConstantRange(BitWidth, /* isFullSet = */ true); SelectPattern StepPattern(*this, BitWidth, Step); if (!StepPattern.isRecognized()) return ConstantRange(BitWidth, /* isFullSet = */ true); if (StartPattern.Condition != StepPattern.Condition) { // We don't handle this case today; but we could, by considering four // possibilities below instead of two. I'm not sure if there are cases where // that will help over what getRange already does, though. return ConstantRange(BitWidth, /* isFullSet = */ true); } // NB! Calling ScalarEvolution::getConstant is fine, but we should not try to // construct arbitrary general SCEV expressions here. This function is called // from deep in the call stack, and calling getSCEV (on a sext instruction, // say) can end up caching a suboptimal value. // FIXME: without the explicit `this` receiver below, MSVC errors out with // C2352 and C2512 (otherwise it isn't needed). const SCEV *TrueStart = this->getConstant(StartPattern.TrueValue); const SCEV *TrueStep = this->getConstant(StepPattern.TrueValue); const SCEV *FalseStart = this->getConstant(StartPattern.FalseValue); const SCEV *FalseStep = this->getConstant(StepPattern.FalseValue); ConstantRange TrueRange = this->getRangeForAffineAR(TrueStart, TrueStep, MaxBECount, BitWidth); ConstantRange FalseRange = this->getRangeForAffineAR(FalseStart, FalseStep, MaxBECount, BitWidth); return TrueRange.unionWith(FalseRange); } SCEV::NoWrapFlags ScalarEvolution::getNoWrapFlagsFromUB(const Value *V) { if (isa(V)) return SCEV::FlagAnyWrap; const BinaryOperator *BinOp = cast(V); // Return early if there are no flags to propagate to the SCEV. SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap; if (BinOp->hasNoUnsignedWrap()) Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW); if (BinOp->hasNoSignedWrap()) Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW); if (Flags == SCEV::FlagAnyWrap) return SCEV::FlagAnyWrap; return isSCEVExprNeverPoison(BinOp) ? Flags : SCEV::FlagAnyWrap; } bool ScalarEvolution::isSCEVExprNeverPoison(const Instruction *I) { // Here we check that I is in the header of the innermost loop containing I, // since we only deal with instructions in the loop header. The actual loop we // need to check later will come from an add recurrence, but getting that // requires computing the SCEV of the operands, which can be expensive. This // check we can do cheaply to rule out some cases early. Loop *InnermostContainingLoop = LI.getLoopFor(I->getParent()); if (InnermostContainingLoop == nullptr || InnermostContainingLoop->getHeader() != I->getParent()) return false; // Only proceed if we can prove that I does not yield poison. if (!isKnownNotFullPoison(I)) return false; // At this point we know that if I is executed, then it does not wrap // according to at least one of NSW or NUW. If I is not executed, then we do // not know if the calculation that I represents would wrap. Multiple // instructions can map to the same SCEV. If we apply NSW or NUW from I to // the SCEV, we must guarantee no wrapping for that SCEV also when it is // derived from other instructions that map to the same SCEV. We cannot make // that guarantee for cases where I is not executed. So we need to find the // loop that I is considered in relation to and prove that I is executed for // every iteration of that loop. That implies that the value that I // calculates does not wrap anywhere in the loop, so then we can apply the // flags to the SCEV. // // We check isLoopInvariant to disambiguate in case we are adding recurrences // from different loops, so that we know which loop to prove that I is // executed in. for (unsigned OpIndex = 0; OpIndex < I->getNumOperands(); ++OpIndex) { + // I could be an extractvalue from a call to an overflow intrinsic. + // TODO: We can do better here in some cases. + if (!isSCEVable(I->getOperand(OpIndex)->getType())) + return false; const SCEV *Op = getSCEV(I->getOperand(OpIndex)); if (auto *AddRec = dyn_cast(Op)) { bool AllOtherOpsLoopInvariant = true; for (unsigned OtherOpIndex = 0; OtherOpIndex < I->getNumOperands(); ++OtherOpIndex) { if (OtherOpIndex != OpIndex) { const SCEV *OtherOp = getSCEV(I->getOperand(OtherOpIndex)); if (!isLoopInvariant(OtherOp, AddRec->getLoop())) { AllOtherOpsLoopInvariant = false; break; } } } if (AllOtherOpsLoopInvariant && isGuaranteedToExecuteForEveryIteration(I, AddRec->getLoop())) return true; } } return false; } bool ScalarEvolution::isAddRecNeverPoison(const Instruction *I, const Loop *L) { // If we know that \c I can never be poison period, then that's enough. if (isSCEVExprNeverPoison(I)) return true; // For an add recurrence specifically, we assume that infinite loops without // side effects are undefined behavior, and then reason as follows: // // If the add recurrence is poison in any iteration, it is poison on all // future iterations (since incrementing poison yields poison). If the result // of the add recurrence is fed into the loop latch condition and the loop // does not contain any throws or exiting blocks other than the latch, we now // have the ability to "choose" whether the backedge is taken or not (by // choosing a sufficiently evil value for the poison feeding into the branch) // for every iteration including and after the one in which \p I first became // poison. There are two possibilities (let's call the iteration in which \p // I first became poison as K): // // 1. In the set of iterations including and after K, the loop body executes // no side effects. In this case executing the backege an infinte number // of times will yield undefined behavior. // // 2. In the set of iterations including and after K, the loop body executes // at least one side effect. In this case, that specific instance of side // effect is control dependent on poison, which also yields undefined // behavior. auto *ExitingBB = L->getExitingBlock(); auto *LatchBB = L->getLoopLatch(); if (!ExitingBB || !LatchBB || ExitingBB != LatchBB) return false; SmallPtrSet Pushed; SmallVector PoisonStack; // We start by assuming \c I, the post-inc add recurrence, is poison. Only // things that are known to be fully poison under that assumption go on the // PoisonStack. Pushed.insert(I); PoisonStack.push_back(I); bool LatchControlDependentOnPoison = false; while (!PoisonStack.empty() && !LatchControlDependentOnPoison) { const Instruction *Poison = PoisonStack.pop_back_val(); for (auto *PoisonUser : Poison->users()) { if (propagatesFullPoison(cast(PoisonUser))) { if (Pushed.insert(cast(PoisonUser)).second) PoisonStack.push_back(cast(PoisonUser)); } else if (auto *BI = dyn_cast(PoisonUser)) { assert(BI->isConditional() && "Only possibility!"); if (BI->getParent() == LatchBB) { LatchControlDependentOnPoison = true; break; } } } } return LatchControlDependentOnPoison && loopHasNoAbnormalExits(L); } bool ScalarEvolution::loopHasNoAbnormalExits(const Loop *L) { auto Itr = LoopHasNoAbnormalExits.find(L); if (Itr == LoopHasNoAbnormalExits.end()) { auto NoAbnormalExitInBB = [&](BasicBlock *BB) { return all_of(*BB, [](Instruction &I) { return isGuaranteedToTransferExecutionToSuccessor(&I); }); }; auto InsertPair = LoopHasNoAbnormalExits.insert( {L, all_of(L->getBlocks(), NoAbnormalExitInBB)}); assert(InsertPair.second && "We just checked!"); Itr = InsertPair.first; } return Itr->second; } const SCEV *ScalarEvolution::createSCEV(Value *V) { if (!isSCEVable(V->getType())) return getUnknown(V); if (Instruction *I = dyn_cast(V)) { // Don't attempt to analyze instructions in blocks that aren't // reachable. Such instructions don't matter, and they aren't required // to obey basic rules for definitions dominating uses which this // analysis depends on. if (!DT.isReachableFromEntry(I->getParent())) return getUnknown(V); } else if (ConstantInt *CI = dyn_cast(V)) return getConstant(CI); else if (isa(V)) return getZero(V->getType()); else if (GlobalAlias *GA = dyn_cast(V)) return GA->isInterposable() ? getUnknown(V) : getSCEV(GA->getAliasee()); else if (!isa(V)) return getUnknown(V); Operator *U = cast(V); if (auto BO = MatchBinaryOp(U, DT)) { switch (BO->Opcode) { case Instruction::Add: { // The simple thing to do would be to just call getSCEV on both operands // and call getAddExpr with the result. However if we're looking at a // bunch of things all added together, this can be quite inefficient, // because it leads to N-1 getAddExpr calls for N ultimate operands. // Instead, gather up all the operands and make a single getAddExpr call. // LLVM IR canonical form means we need only traverse the left operands. SmallVector AddOps; do { if (BO->Op) { if (auto *OpSCEV = getExistingSCEV(BO->Op)) { AddOps.push_back(OpSCEV); break; } // If a NUW or NSW flag can be applied to the SCEV for this // addition, then compute the SCEV for this addition by itself // with a separate call to getAddExpr. We need to do that // instead of pushing the operands of the addition onto AddOps, // since the flags are only known to apply to this particular // addition - they may not apply to other additions that can be // formed with operands from AddOps. const SCEV *RHS = getSCEV(BO->RHS); SCEV::NoWrapFlags Flags = getNoWrapFlagsFromUB(BO->Op); if (Flags != SCEV::FlagAnyWrap) { const SCEV *LHS = getSCEV(BO->LHS); if (BO->Opcode == Instruction::Sub) AddOps.push_back(getMinusSCEV(LHS, RHS, Flags)); else AddOps.push_back(getAddExpr(LHS, RHS, Flags)); break; } } if (BO->Opcode == Instruction::Sub) AddOps.push_back(getNegativeSCEV(getSCEV(BO->RHS))); else AddOps.push_back(getSCEV(BO->RHS)); auto NewBO = MatchBinaryOp(BO->LHS, DT); if (!NewBO || (NewBO->Opcode != Instruction::Add && NewBO->Opcode != Instruction::Sub)) { AddOps.push_back(getSCEV(BO->LHS)); break; } BO = NewBO; } while (true); return getAddExpr(AddOps); } case Instruction::Mul: { SmallVector MulOps; do { if (BO->Op) { if (auto *OpSCEV = getExistingSCEV(BO->Op)) { MulOps.push_back(OpSCEV); break; } SCEV::NoWrapFlags Flags = getNoWrapFlagsFromUB(BO->Op); if (Flags != SCEV::FlagAnyWrap) { MulOps.push_back( getMulExpr(getSCEV(BO->LHS), getSCEV(BO->RHS), Flags)); break; } } MulOps.push_back(getSCEV(BO->RHS)); auto NewBO = MatchBinaryOp(BO->LHS, DT); if (!NewBO || NewBO->Opcode != Instruction::Mul) { MulOps.push_back(getSCEV(BO->LHS)); break; } BO = NewBO; } while (true); return getMulExpr(MulOps); } case Instruction::UDiv: return getUDivExpr(getSCEV(BO->LHS), getSCEV(BO->RHS)); case Instruction::Sub: { SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap; if (BO->Op) Flags = getNoWrapFlagsFromUB(BO->Op); return getMinusSCEV(getSCEV(BO->LHS), getSCEV(BO->RHS), Flags); } case Instruction::And: // For an expression like x&255 that merely masks off the high bits, // use zext(trunc(x)) as the SCEV expression. if (ConstantInt *CI = dyn_cast(BO->RHS)) { if (CI->isNullValue()) return getSCEV(BO->RHS); if (CI->isAllOnesValue()) return getSCEV(BO->LHS); const APInt &A = CI->getValue(); // Instcombine's ShrinkDemandedConstant may strip bits out of // constants, obscuring what would otherwise be a low-bits mask. // Use computeKnownBits to compute what ShrinkDemandedConstant // knew about to reconstruct a low-bits mask value. unsigned LZ = A.countLeadingZeros(); unsigned TZ = A.countTrailingZeros(); unsigned BitWidth = A.getBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); computeKnownBits(BO->LHS, KnownZero, KnownOne, getDataLayout(), 0, &AC, nullptr, &DT); APInt EffectiveMask = APInt::getLowBitsSet(BitWidth, BitWidth - LZ - TZ).shl(TZ); if ((LZ != 0 || TZ != 0) && !((~A & ~KnownZero) & EffectiveMask)) { const SCEV *MulCount = getConstant(ConstantInt::get( getContext(), APInt::getOneBitSet(BitWidth, TZ))); return getMulExpr( getZeroExtendExpr( getTruncateExpr( getUDivExactExpr(getSCEV(BO->LHS), MulCount), IntegerType::get(getContext(), BitWidth - LZ - TZ)), BO->LHS->getType()), MulCount); } } break; case Instruction::Or: // If the RHS of the Or is a constant, we may have something like: // X*4+1 which got turned into X*4|1. Handle this as an Add so loop // optimizations will transparently handle this case. // // In order for this transformation to be safe, the LHS must be of the // form X*(2^n) and the Or constant must be less than 2^n. if (ConstantInt *CI = dyn_cast(BO->RHS)) { const SCEV *LHS = getSCEV(BO->LHS); const APInt &CIVal = CI->getValue(); if (GetMinTrailingZeros(LHS) >= (CIVal.getBitWidth() - CIVal.countLeadingZeros())) { // Build a plain add SCEV. const SCEV *S = getAddExpr(LHS, getSCEV(CI)); // If the LHS of the add was an addrec and it has no-wrap flags, // transfer the no-wrap flags, since an or won't introduce a wrap. if (const SCEVAddRecExpr *NewAR = dyn_cast(S)) { const SCEVAddRecExpr *OldAR = cast(LHS); const_cast(NewAR)->setNoWrapFlags( OldAR->getNoWrapFlags()); } return S; } } break; case Instruction::Xor: if (ConstantInt *CI = dyn_cast(BO->RHS)) { // If the RHS of xor is -1, then this is a not operation. if (CI->isAllOnesValue()) return getNotSCEV(getSCEV(BO->LHS)); // Model xor(and(x, C), C) as and(~x, C), if C is a low-bits mask. // This is a variant of the check for xor with -1, and it handles // the case where instcombine has trimmed non-demanded bits out // of an xor with -1. if (auto *LBO = dyn_cast(BO->LHS)) if (ConstantInt *LCI = dyn_cast(LBO->getOperand(1))) if (LBO->getOpcode() == Instruction::And && LCI->getValue() == CI->getValue()) if (const SCEVZeroExtendExpr *Z = dyn_cast(getSCEV(BO->LHS))) { Type *UTy = BO->LHS->getType(); const SCEV *Z0 = Z->getOperand(); Type *Z0Ty = Z0->getType(); unsigned Z0TySize = getTypeSizeInBits(Z0Ty); // If C is a low-bits mask, the zero extend is serving to // mask off the high bits. Complement the operand and // re-apply the zext. if (APIntOps::isMask(Z0TySize, CI->getValue())) return getZeroExtendExpr(getNotSCEV(Z0), UTy); // If C is a single bit, it may be in the sign-bit position // before the zero-extend. In this case, represent the xor // using an add, which is equivalent, and re-apply the zext. APInt Trunc = CI->getValue().trunc(Z0TySize); if (Trunc.zext(getTypeSizeInBits(UTy)) == CI->getValue() && Trunc.isSignBit()) return getZeroExtendExpr(getAddExpr(Z0, getConstant(Trunc)), UTy); } } break; case Instruction::Shl: // Turn shift left of a constant amount into a multiply. if (ConstantInt *SA = dyn_cast(BO->RHS)) { uint32_t BitWidth = cast(SA->getType())->getBitWidth(); // If the shift count is not less than the bitwidth, the result of // the shift is undefined. Don't try to analyze it, because the // resolution chosen here may differ from the resolution chosen in // other parts of the compiler. if (SA->getValue().uge(BitWidth)) break; // It is currently not resolved how to interpret NSW for left // shift by BitWidth - 1, so we avoid applying flags in that // case. Remove this check (or this comment) once the situation // is resolved. See // http://lists.llvm.org/pipermail/llvm-dev/2015-April/084195.html // and http://reviews.llvm.org/D8890 . auto Flags = SCEV::FlagAnyWrap; if (BO->Op && SA->getValue().ult(BitWidth - 1)) Flags = getNoWrapFlagsFromUB(BO->Op); Constant *X = ConstantInt::get(getContext(), APInt::getOneBitSet(BitWidth, SA->getZExtValue())); return getMulExpr(getSCEV(BO->LHS), getSCEV(X), Flags); } break; case Instruction::AShr: // For a two-shift sext-inreg, use sext(trunc(x)) as the SCEV expression. if (ConstantInt *CI = dyn_cast(BO->RHS)) if (Operator *L = dyn_cast(BO->LHS)) if (L->getOpcode() == Instruction::Shl && L->getOperand(1) == BO->RHS) { uint64_t BitWidth = getTypeSizeInBits(BO->LHS->getType()); // If the shift count is not less than the bitwidth, the result of // the shift is undefined. Don't try to analyze it, because the // resolution chosen here may differ from the resolution chosen in // other parts of the compiler. if (CI->getValue().uge(BitWidth)) break; uint64_t Amt = BitWidth - CI->getZExtValue(); if (Amt == BitWidth) return getSCEV(L->getOperand(0)); // shift by zero --> noop return getSignExtendExpr( getTruncateExpr(getSCEV(L->getOperand(0)), IntegerType::get(getContext(), Amt)), BO->LHS->getType()); } break; } } switch (U->getOpcode()) { case Instruction::Trunc: return getTruncateExpr(getSCEV(U->getOperand(0)), U->getType()); case Instruction::ZExt: return getZeroExtendExpr(getSCEV(U->getOperand(0)), U->getType()); case Instruction::SExt: return getSignExtendExpr(getSCEV(U->getOperand(0)), U->getType()); case Instruction::BitCast: // BitCasts are no-op casts so we just eliminate the cast. if (isSCEVable(U->getType()) && isSCEVable(U->getOperand(0)->getType())) return getSCEV(U->getOperand(0)); break; // It's tempting to handle inttoptr and ptrtoint as no-ops, however this can // lead to pointer expressions which cannot safely be expanded to GEPs, // because ScalarEvolution doesn't respect the GEP aliasing rules when // simplifying integer expressions. case Instruction::GetElementPtr: return createNodeForGEP(cast(U)); case Instruction::PHI: return createNodeForPHI(cast(U)); case Instruction::Select: // U can also be a select constant expr, which let fall through. Since // createNodeForSelect only works for a condition that is an `ICmpInst`, and // constant expressions cannot have instructions as operands, we'd have // returned getUnknown for a select constant expressions anyway. if (isa(U)) return createNodeForSelectOrPHI(cast(U), U->getOperand(0), U->getOperand(1), U->getOperand(2)); break; case Instruction::Call: case Instruction::Invoke: if (Value *RV = CallSite(U).getReturnedArgOperand()) return getSCEV(RV); break; } return getUnknown(V); } //===----------------------------------------------------------------------===// // Iteration Count Computation Code // unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L) { if (BasicBlock *ExitingBB = L->getExitingBlock()) return getSmallConstantTripCount(L, ExitingBB); // No trip count information for multiple exits. return 0; } unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L, BasicBlock *ExitingBlock) { assert(ExitingBlock && "Must pass a non-null exiting block!"); assert(L->isLoopExiting(ExitingBlock) && "Exiting block must actually branch out of the loop!"); const SCEVConstant *ExitCount = dyn_cast(getExitCount(L, ExitingBlock)); if (!ExitCount) return 0; ConstantInt *ExitConst = ExitCount->getValue(); // Guard against huge trip counts. if (ExitConst->getValue().getActiveBits() > 32) return 0; // In case of integer overflow, this returns 0, which is correct. return ((unsigned)ExitConst->getZExtValue()) + 1; } unsigned ScalarEvolution::getSmallConstantTripMultiple(Loop *L) { if (BasicBlock *ExitingBB = L->getExitingBlock()) return getSmallConstantTripMultiple(L, ExitingBB); // No trip multiple information for multiple exits. return 0; } /// Returns the largest constant divisor of the trip count of this loop as a /// normal unsigned value, if possible. This means that the actual trip count is /// always a multiple of the returned value (don't forget the trip count could /// very well be zero as well!). /// /// Returns 1 if the trip count is unknown or not guaranteed to be the /// multiple of a constant (which is also the case if the trip count is simply /// constant, use getSmallConstantTripCount for that case), Will also return 1 /// if the trip count is very large (>= 2^32). /// /// As explained in the comments for getSmallConstantTripCount, this assumes /// that control exits the loop via ExitingBlock. unsigned ScalarEvolution::getSmallConstantTripMultiple(Loop *L, BasicBlock *ExitingBlock) { assert(ExitingBlock && "Must pass a non-null exiting block!"); assert(L->isLoopExiting(ExitingBlock) && "Exiting block must actually branch out of the loop!"); const SCEV *ExitCount = getExitCount(L, ExitingBlock); if (ExitCount == getCouldNotCompute()) return 1; // Get the trip count from the BE count by adding 1. const SCEV *TCMul = getAddExpr(ExitCount, getOne(ExitCount->getType())); // FIXME: SCEV distributes multiplication as V1*C1 + V2*C1. We could attempt // to factor simple cases. if (const SCEVMulExpr *Mul = dyn_cast(TCMul)) TCMul = Mul->getOperand(0); const SCEVConstant *MulC = dyn_cast(TCMul); if (!MulC) return 1; ConstantInt *Result = MulC->getValue(); // Guard against huge trip counts (this requires checking // for zero to handle the case where the trip count == -1 and the // addition wraps). if (!Result || Result->getValue().getActiveBits() > 32 || Result->getValue().getActiveBits() == 0) return 1; return (unsigned)Result->getZExtValue(); } /// Get the expression for the number of loop iterations for which this loop is /// guaranteed not to exit via ExitingBlock. Otherwise return /// SCEVCouldNotCompute. const SCEV *ScalarEvolution::getExitCount(Loop *L, BasicBlock *ExitingBlock) { return getBackedgeTakenInfo(L).getExact(ExitingBlock, this); } const SCEV * ScalarEvolution::getPredicatedBackedgeTakenCount(const Loop *L, SCEVUnionPredicate &Preds) { return getPredicatedBackedgeTakenInfo(L).getExact(this, &Preds); } const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L) { return getBackedgeTakenInfo(L).getExact(this); } /// Similar to getBackedgeTakenCount, except return the least SCEV value that is /// known never to be less than the actual backedge taken count. const SCEV *ScalarEvolution::getMaxBackedgeTakenCount(const Loop *L) { return getBackedgeTakenInfo(L).getMax(this); } /// Push PHI nodes in the header of the given loop onto the given Worklist. static void PushLoopPHIs(const Loop *L, SmallVectorImpl &Worklist) { BasicBlock *Header = L->getHeader(); // Push all Loop-header PHIs onto the Worklist stack. for (BasicBlock::iterator I = Header->begin(); PHINode *PN = dyn_cast(I); ++I) Worklist.push_back(PN); } const ScalarEvolution::BackedgeTakenInfo & ScalarEvolution::getPredicatedBackedgeTakenInfo(const Loop *L) { auto &BTI = getBackedgeTakenInfo(L); if (BTI.hasFullInfo()) return BTI; auto Pair = PredicatedBackedgeTakenCounts.insert({L, BackedgeTakenInfo()}); if (!Pair.second) return Pair.first->second; BackedgeTakenInfo Result = computeBackedgeTakenCount(L, /*AllowPredicates=*/true); return PredicatedBackedgeTakenCounts.find(L)->second = Result; } const ScalarEvolution::BackedgeTakenInfo & ScalarEvolution::getBackedgeTakenInfo(const Loop *L) { // Initially insert an invalid entry for this loop. If the insertion // succeeds, proceed to actually compute a backedge-taken count and // update the value. The temporary CouldNotCompute value tells SCEV // code elsewhere that it shouldn't attempt to request a new // backedge-taken count, which could result in infinite recursion. std::pair::iterator, bool> Pair = BackedgeTakenCounts.insert({L, BackedgeTakenInfo()}); if (!Pair.second) return Pair.first->second; // computeBackedgeTakenCount may allocate memory for its result. Inserting it // into the BackedgeTakenCounts map transfers ownership. Otherwise, the result // must be cleared in this scope. BackedgeTakenInfo Result = computeBackedgeTakenCount(L); if (Result.getExact(this) != getCouldNotCompute()) { assert(isLoopInvariant(Result.getExact(this), L) && isLoopInvariant(Result.getMax(this), L) && "Computed backedge-taken count isn't loop invariant for loop!"); ++NumTripCountsComputed; } else if (Result.getMax(this) == getCouldNotCompute() && isa(L->getHeader()->begin())) { // Only count loops that have phi nodes as not being computable. ++NumTripCountsNotComputed; } // Now that we know more about the trip count for this loop, forget any // existing SCEV values for PHI nodes in this loop since they are only // conservative estimates made without the benefit of trip count // information. This is similar to the code in forgetLoop, except that // it handles SCEVUnknown PHI nodes specially. if (Result.hasAnyInfo()) { SmallVector Worklist; PushLoopPHIs(L, Worklist); SmallPtrSet Visited; while (!Worklist.empty()) { Instruction *I = Worklist.pop_back_val(); if (!Visited.insert(I).second) continue; ValueExprMapType::iterator It = ValueExprMap.find_as(static_cast(I)); if (It != ValueExprMap.end()) { const SCEV *Old = It->second; // SCEVUnknown for a PHI either means that it has an unrecognized // structure, or it's a PHI that's in the progress of being computed // by createNodeForPHI. In the former case, additional loop trip // count information isn't going to change anything. In the later // case, createNodeForPHI will perform the necessary updates on its // own when it gets to that point. if (!isa(I) || !isa(Old)) { forgetMemoizedResults(Old); ValueExprMap.erase(It); } if (PHINode *PN = dyn_cast(I)) ConstantEvolutionLoopExitValue.erase(PN); } PushDefUseChildren(I, Worklist); } } // Re-lookup the insert position, since the call to // computeBackedgeTakenCount above could result in a // recusive call to getBackedgeTakenInfo (on a different // loop), which would invalidate the iterator computed // earlier. return BackedgeTakenCounts.find(L)->second = Result; } void ScalarEvolution::forgetLoop(const Loop *L) { // Drop any stored trip count value. auto RemoveLoopFromBackedgeMap = [L](DenseMap &Map) { auto BTCPos = Map.find(L); if (BTCPos != Map.end()) { BTCPos->second.clear(); Map.erase(BTCPos); } }; RemoveLoopFromBackedgeMap(BackedgeTakenCounts); RemoveLoopFromBackedgeMap(PredicatedBackedgeTakenCounts); // Drop information about expressions based on loop-header PHIs. SmallVector Worklist; PushLoopPHIs(L, Worklist); SmallPtrSet Visited; while (!Worklist.empty()) { Instruction *I = Worklist.pop_back_val(); if (!Visited.insert(I).second) continue; ValueExprMapType::iterator It = ValueExprMap.find_as(static_cast(I)); if (It != ValueExprMap.end()) { forgetMemoizedResults(It->second); ValueExprMap.erase(It); if (PHINode *PN = dyn_cast(I)) ConstantEvolutionLoopExitValue.erase(PN); } PushDefUseChildren(I, Worklist); } // Forget all contained loops too, to avoid dangling entries in the // ValuesAtScopes map. for (Loop *I : *L) forgetLoop(I); LoopHasNoAbnormalExits.erase(L); } void ScalarEvolution::forgetValue(Value *V) { Instruction *I = dyn_cast(V); if (!I) return; // Drop information about expressions based on loop-header PHIs. SmallVector Worklist; Worklist.push_back(I); SmallPtrSet Visited; while (!Worklist.empty()) { I = Worklist.pop_back_val(); if (!Visited.insert(I).second) continue; ValueExprMapType::iterator It = ValueExprMap.find_as(static_cast(I)); if (It != ValueExprMap.end()) { forgetMemoizedResults(It->second); ValueExprMap.erase(It); if (PHINode *PN = dyn_cast(I)) ConstantEvolutionLoopExitValue.erase(PN); } PushDefUseChildren(I, Worklist); } } /// Get the exact loop backedge taken count considering all loop exits. A /// computable result can only be returned for loops with a single exit. /// Returning the minimum taken count among all exits is incorrect because one /// of the loop's exit limit's may have been skipped. howFarToZero assumes that /// the limit of each loop test is never skipped. This is a valid assumption as /// long as the loop exits via that test. For precise results, it is the /// caller's responsibility to specify the relevant loop exit using /// getExact(ExitingBlock, SE). const SCEV * ScalarEvolution::BackedgeTakenInfo::getExact( ScalarEvolution *SE, SCEVUnionPredicate *Preds) const { // If any exits were not computable, the loop is not computable. if (!ExitNotTaken.isCompleteList()) return SE->getCouldNotCompute(); // We need exactly one computable exit. if (!ExitNotTaken.ExitingBlock) return SE->getCouldNotCompute(); assert(ExitNotTaken.ExactNotTaken && "uninitialized not-taken info"); const SCEV *BECount = nullptr; for (auto &ENT : ExitNotTaken) { assert(ENT.ExactNotTaken != SE->getCouldNotCompute() && "bad exit SCEV"); if (!BECount) BECount = ENT.ExactNotTaken; else if (BECount != ENT.ExactNotTaken) return SE->getCouldNotCompute(); if (Preds && ENT.getPred()) Preds->add(ENT.getPred()); assert((Preds || ENT.hasAlwaysTruePred()) && "Predicate should be always true!"); } assert(BECount && "Invalid not taken count for loop exit"); return BECount; } /// Get the exact not taken count for this loop exit. const SCEV * ScalarEvolution::BackedgeTakenInfo::getExact(BasicBlock *ExitingBlock, ScalarEvolution *SE) const { for (auto &ENT : ExitNotTaken) if (ENT.ExitingBlock == ExitingBlock && ENT.hasAlwaysTruePred()) return ENT.ExactNotTaken; return SE->getCouldNotCompute(); } /// getMax - Get the max backedge taken count for the loop. const SCEV * ScalarEvolution::BackedgeTakenInfo::getMax(ScalarEvolution *SE) const { for (auto &ENT : ExitNotTaken) if (!ENT.hasAlwaysTruePred()) return SE->getCouldNotCompute(); return Max ? Max : SE->getCouldNotCompute(); } bool ScalarEvolution::BackedgeTakenInfo::hasOperand(const SCEV *S, ScalarEvolution *SE) const { if (Max && Max != SE->getCouldNotCompute() && SE->hasOperand(Max, S)) return true; if (!ExitNotTaken.ExitingBlock) return false; for (auto &ENT : ExitNotTaken) if (ENT.ExactNotTaken != SE->getCouldNotCompute() && SE->hasOperand(ENT.ExactNotTaken, S)) return true; return false; } /// Allocate memory for BackedgeTakenInfo and copy the not-taken count of each /// computable exit into a persistent ExitNotTakenInfo array. ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo( SmallVectorImpl &ExitCounts, bool Complete, const SCEV *MaxCount) : Max(MaxCount) { if (!Complete) ExitNotTaken.setIncomplete(); unsigned NumExits = ExitCounts.size(); if (NumExits == 0) return; ExitNotTaken.ExitingBlock = ExitCounts[0].ExitBlock; ExitNotTaken.ExactNotTaken = ExitCounts[0].Taken; // Determine the number of ExitNotTakenExtras structures that we need. unsigned ExtraInfoSize = 0; if (NumExits > 1) ExtraInfoSize = 1 + std::count_if(std::next(ExitCounts.begin()), ExitCounts.end(), [](EdgeInfo &Entry) { return !Entry.Pred.isAlwaysTrue(); }); else if (!ExitCounts[0].Pred.isAlwaysTrue()) ExtraInfoSize = 1; ExitNotTakenExtras *ENT = nullptr; // Allocate the ExitNotTakenExtras structures and initialize the first // element (ExitNotTaken). if (ExtraInfoSize > 0) { ENT = new ExitNotTakenExtras[ExtraInfoSize]; ExitNotTaken.ExtraInfo = &ENT[0]; *ExitNotTaken.getPred() = std::move(ExitCounts[0].Pred); } if (NumExits == 1) return; assert(ENT && "ExitNotTakenExtras is NULL while having more than one exit"); auto &Exits = ExitNotTaken.ExtraInfo->Exits; // Handle the rare case of multiple computable exits. for (unsigned i = 1, PredPos = 1; i < NumExits; ++i) { ExitNotTakenExtras *Ptr = nullptr; if (!ExitCounts[i].Pred.isAlwaysTrue()) { Ptr = &ENT[PredPos++]; Ptr->Pred = std::move(ExitCounts[i].Pred); } Exits.emplace_back(ExitCounts[i].ExitBlock, ExitCounts[i].Taken, Ptr); } } /// Invalidate this result and free the ExitNotTakenInfo array. void ScalarEvolution::BackedgeTakenInfo::clear() { ExitNotTaken.ExitingBlock = nullptr; ExitNotTaken.ExactNotTaken = nullptr; delete[] ExitNotTaken.ExtraInfo; } /// Compute the number of times the backedge of the specified loop will execute. ScalarEvolution::BackedgeTakenInfo ScalarEvolution::computeBackedgeTakenCount(const Loop *L, bool AllowPredicates) { SmallVector ExitingBlocks; L->getExitingBlocks(ExitingBlocks); SmallVector ExitCounts; bool CouldComputeBECount = true; BasicBlock *Latch = L->getLoopLatch(); // may be NULL. const SCEV *MustExitMaxBECount = nullptr; const SCEV *MayExitMaxBECount = nullptr; // Compute the ExitLimit for each loop exit. Use this to populate ExitCounts // and compute maxBECount. // Do a union of all the predicates here. for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { BasicBlock *ExitBB = ExitingBlocks[i]; ExitLimit EL = computeExitLimit(L, ExitBB, AllowPredicates); assert((AllowPredicates || EL.Pred.isAlwaysTrue()) && "Predicated exit limit when predicates are not allowed!"); // 1. For each exit that can be computed, add an entry to ExitCounts. // CouldComputeBECount is true only if all exits can be computed. if (EL.Exact == getCouldNotCompute()) // We couldn't compute an exact value for this exit, so // we won't be able to compute an exact value for the loop. CouldComputeBECount = false; else ExitCounts.emplace_back(EdgeInfo(ExitBB, EL.Exact, EL.Pred)); // 2. Derive the loop's MaxBECount from each exit's max number of // non-exiting iterations. Partition the loop exits into two kinds: // LoopMustExits and LoopMayExits. // // If the exit dominates the loop latch, it is a LoopMustExit otherwise it // is a LoopMayExit. If any computable LoopMustExit is found, then // MaxBECount is the minimum EL.Max of computable LoopMustExits. Otherwise, // MaxBECount is conservatively the maximum EL.Max, where CouldNotCompute is // considered greater than any computable EL.Max. if (EL.Max != getCouldNotCompute() && Latch && DT.dominates(ExitBB, Latch)) { if (!MustExitMaxBECount) MustExitMaxBECount = EL.Max; else { MustExitMaxBECount = getUMinFromMismatchedTypes(MustExitMaxBECount, EL.Max); } } else if (MayExitMaxBECount != getCouldNotCompute()) { if (!MayExitMaxBECount || EL.Max == getCouldNotCompute()) MayExitMaxBECount = EL.Max; else { MayExitMaxBECount = getUMaxFromMismatchedTypes(MayExitMaxBECount, EL.Max); } } } const SCEV *MaxBECount = MustExitMaxBECount ? MustExitMaxBECount : (MayExitMaxBECount ? MayExitMaxBECount : getCouldNotCompute()); return BackedgeTakenInfo(ExitCounts, CouldComputeBECount, MaxBECount); } ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimit(const Loop *L, BasicBlock *ExitingBlock, bool AllowPredicates) { // Okay, we've chosen an exiting block. See what condition causes us to exit // at this block and remember the exit block and whether all other targets // lead to the loop header. bool MustExecuteLoopHeader = true; BasicBlock *Exit = nullptr; for (auto *SBB : successors(ExitingBlock)) if (!L->contains(SBB)) { if (Exit) // Multiple exit successors. return getCouldNotCompute(); Exit = SBB; } else if (SBB != L->getHeader()) { MustExecuteLoopHeader = false; } // At this point, we know we have a conditional branch that determines whether // the loop is exited. However, we don't know if the branch is executed each // time through the loop. If not, then the execution count of the branch will // not be equal to the trip count of the loop. // // Currently we check for this by checking to see if the Exit branch goes to // the loop header. If so, we know it will always execute the same number of // times as the loop. We also handle the case where the exit block *is* the // loop header. This is common for un-rotated loops. // // If both of those tests fail, walk up the unique predecessor chain to the // header, stopping if there is an edge that doesn't exit the loop. If the // header is reached, the execution count of the branch will be equal to the // trip count of the loop. // // More extensive analysis could be done to handle more cases here. // if (!MustExecuteLoopHeader && ExitingBlock != L->getHeader()) { // The simple checks failed, try climbing the unique predecessor chain // up to the header. bool Ok = false; for (BasicBlock *BB = ExitingBlock; BB; ) { BasicBlock *Pred = BB->getUniquePredecessor(); if (!Pred) return getCouldNotCompute(); TerminatorInst *PredTerm = Pred->getTerminator(); for (const BasicBlock *PredSucc : PredTerm->successors()) { if (PredSucc == BB) continue; // If the predecessor has a successor that isn't BB and isn't // outside the loop, assume the worst. if (L->contains(PredSucc)) return getCouldNotCompute(); } if (Pred == L->getHeader()) { Ok = true; break; } BB = Pred; } if (!Ok) return getCouldNotCompute(); } bool IsOnlyExit = (L->getExitingBlock() != nullptr); TerminatorInst *Term = ExitingBlock->getTerminator(); if (BranchInst *BI = dyn_cast(Term)) { assert(BI->isConditional() && "If unconditional, it can't be in loop!"); // Proceed to the next level to examine the exit condition expression. return computeExitLimitFromCond( L, BI->getCondition(), BI->getSuccessor(0), BI->getSuccessor(1), /*ControlsExit=*/IsOnlyExit, AllowPredicates); } if (SwitchInst *SI = dyn_cast(Term)) return computeExitLimitFromSingleExitSwitch(L, SI, Exit, /*ControlsExit=*/IsOnlyExit); return getCouldNotCompute(); } ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCond(const Loop *L, Value *ExitCond, BasicBlock *TBB, BasicBlock *FBB, bool ControlsExit, bool AllowPredicates) { // Check if the controlling expression for this loop is an And or Or. if (BinaryOperator *BO = dyn_cast(ExitCond)) { if (BO->getOpcode() == Instruction::And) { // Recurse on the operands of the and. bool EitherMayExit = L->contains(TBB); ExitLimit EL0 = computeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB, ControlsExit && !EitherMayExit, AllowPredicates); ExitLimit EL1 = computeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB, ControlsExit && !EitherMayExit, AllowPredicates); const SCEV *BECount = getCouldNotCompute(); const SCEV *MaxBECount = getCouldNotCompute(); if (EitherMayExit) { // Both conditions must be true for the loop to continue executing. // Choose the less conservative count. if (EL0.Exact == getCouldNotCompute() || EL1.Exact == getCouldNotCompute()) BECount = getCouldNotCompute(); else BECount = getUMinFromMismatchedTypes(EL0.Exact, EL1.Exact); if (EL0.Max == getCouldNotCompute()) MaxBECount = EL1.Max; else if (EL1.Max == getCouldNotCompute()) MaxBECount = EL0.Max; else MaxBECount = getUMinFromMismatchedTypes(EL0.Max, EL1.Max); } else { // Both conditions must be true at the same time for the loop to exit. // For now, be conservative. assert(L->contains(FBB) && "Loop block has no successor in loop!"); if (EL0.Max == EL1.Max) MaxBECount = EL0.Max; if (EL0.Exact == EL1.Exact) BECount = EL0.Exact; } SCEVUnionPredicate NP; NP.add(&EL0.Pred); NP.add(&EL1.Pred); // There are cases (e.g. PR26207) where computeExitLimitFromCond is able // to be more aggressive when computing BECount than when computing // MaxBECount. In these cases it is possible for EL0.Exact and EL1.Exact // to match, but for EL0.Max and EL1.Max to not. if (isa(MaxBECount) && !isa(BECount)) MaxBECount = BECount; return ExitLimit(BECount, MaxBECount, NP); } if (BO->getOpcode() == Instruction::Or) { // Recurse on the operands of the or. bool EitherMayExit = L->contains(FBB); ExitLimit EL0 = computeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB, ControlsExit && !EitherMayExit, AllowPredicates); ExitLimit EL1 = computeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB, ControlsExit && !EitherMayExit, AllowPredicates); const SCEV *BECount = getCouldNotCompute(); const SCEV *MaxBECount = getCouldNotCompute(); if (EitherMayExit) { // Both conditions must be false for the loop to continue executing. // Choose the less conservative count. if (EL0.Exact == getCouldNotCompute() || EL1.Exact == getCouldNotCompute()) BECount = getCouldNotCompute(); else BECount = getUMinFromMismatchedTypes(EL0.Exact, EL1.Exact); if (EL0.Max == getCouldNotCompute()) MaxBECount = EL1.Max; else if (EL1.Max == getCouldNotCompute()) MaxBECount = EL0.Max; else MaxBECount = getUMinFromMismatchedTypes(EL0.Max, EL1.Max); } else { // Both conditions must be false at the same time for the loop to exit. // For now, be conservative. assert(L->contains(TBB) && "Loop block has no successor in loop!"); if (EL0.Max == EL1.Max) MaxBECount = EL0.Max; if (EL0.Exact == EL1.Exact) BECount = EL0.Exact; } SCEVUnionPredicate NP; NP.add(&EL0.Pred); NP.add(&EL1.Pred); return ExitLimit(BECount, MaxBECount, NP); } } // With an icmp, it may be feasible to compute an exact backedge-taken count. // Proceed to the next level to examine the icmp. if (ICmpInst *ExitCondICmp = dyn_cast(ExitCond)) { ExitLimit EL = computeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, ControlsExit); if (EL.hasFullInfo() || !AllowPredicates) return EL; // Try again, but use SCEV predicates this time. return computeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, ControlsExit, /*AllowPredicates=*/true); } // Check for a constant condition. These are normally stripped out by // SimplifyCFG, but ScalarEvolution may be used by a pass which wishes to // preserve the CFG and is temporarily leaving constant conditions // in place. if (ConstantInt *CI = dyn_cast(ExitCond)) { if (L->contains(FBB) == !CI->getZExtValue()) // The backedge is always taken. return getCouldNotCompute(); else // The backedge is never taken. return getZero(CI->getType()); } // If it's not an integer or pointer comparison then compute it the hard way. return computeExitCountExhaustively(L, ExitCond, !L->contains(TBB)); } ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromICmp(const Loop *L, ICmpInst *ExitCond, BasicBlock *TBB, BasicBlock *FBB, bool ControlsExit, bool AllowPredicates) { // If the condition was exit on true, convert the condition to exit on false ICmpInst::Predicate Cond; if (!L->contains(FBB)) Cond = ExitCond->getPredicate(); else Cond = ExitCond->getInversePredicate(); // Handle common loops like: for (X = "string"; *X; ++X) if (LoadInst *LI = dyn_cast(ExitCond->getOperand(0))) if (Constant *RHS = dyn_cast(ExitCond->getOperand(1))) { ExitLimit ItCnt = computeLoadConstantCompareExitLimit(LI, RHS, L, Cond); if (ItCnt.hasAnyInfo()) return ItCnt; } const SCEV *LHS = getSCEV(ExitCond->getOperand(0)); const SCEV *RHS = getSCEV(ExitCond->getOperand(1)); // Try to evaluate any dependencies out of the loop. LHS = getSCEVAtScope(LHS, L); RHS = getSCEVAtScope(RHS, L); // At this point, we would like to compute how many iterations of the // loop the predicate will return true for these inputs. if (isLoopInvariant(LHS, L) && !isLoopInvariant(RHS, L)) { // If there is a loop-invariant, force it into the RHS. std::swap(LHS, RHS); Cond = ICmpInst::getSwappedPredicate(Cond); } // Simplify the operands before analyzing them. (void)SimplifyICmpOperands(Cond, LHS, RHS); // If we have a comparison of a chrec against a constant, try to use value // ranges to answer this query. if (const SCEVConstant *RHSC = dyn_cast(RHS)) if (const SCEVAddRecExpr *AddRec = dyn_cast(LHS)) if (AddRec->getLoop() == L) { // Form the constant range. ConstantRange CompRange( ICmpInst::makeConstantRange(Cond, RHSC->getAPInt())); const SCEV *Ret = AddRec->getNumIterationsInRange(CompRange, *this); if (!isa(Ret)) return Ret; } switch (Cond) { case ICmpInst::ICMP_NE: { // while (X != Y) // Convert to: while (X-Y != 0) ExitLimit EL = howFarToZero(getMinusSCEV(LHS, RHS), L, ControlsExit, AllowPredicates); if (EL.hasAnyInfo()) return EL; break; } case ICmpInst::ICMP_EQ: { // while (X == Y) // Convert to: while (X-Y == 0) ExitLimit EL = howFarToNonZero(getMinusSCEV(LHS, RHS), L); if (EL.hasAnyInfo()) return EL; break; } case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_ULT: { // while (X < Y) bool IsSigned = Cond == ICmpInst::ICMP_SLT; ExitLimit EL = howManyLessThans(LHS, RHS, L, IsSigned, ControlsExit, AllowPredicates); if (EL.hasAnyInfo()) return EL; break; } case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_UGT: { // while (X > Y) bool IsSigned = Cond == ICmpInst::ICMP_SGT; ExitLimit EL = howManyGreaterThans(LHS, RHS, L, IsSigned, ControlsExit, AllowPredicates); if (EL.hasAnyInfo()) return EL; break; } default: break; } auto *ExhaustiveCount = computeExitCountExhaustively(L, ExitCond, !L->contains(TBB)); if (!isa(ExhaustiveCount)) return ExhaustiveCount; return computeShiftCompareExitLimit(ExitCond->getOperand(0), ExitCond->getOperand(1), L, Cond); } ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromSingleExitSwitch(const Loop *L, SwitchInst *Switch, BasicBlock *ExitingBlock, bool ControlsExit) { assert(!L->contains(ExitingBlock) && "Not an exiting block!"); // Give up if the exit is the default dest of a switch. if (Switch->getDefaultDest() == ExitingBlock) return getCouldNotCompute(); assert(L->contains(Switch->getDefaultDest()) && "Default case must not exit the loop!"); const SCEV *LHS = getSCEVAtScope(Switch->getCondition(), L); const SCEV *RHS = getConstant(Switch->findCaseDest(ExitingBlock)); // while (X != Y) --> while (X-Y != 0) ExitLimit EL = howFarToZero(getMinusSCEV(LHS, RHS), L, ControlsExit); if (EL.hasAnyInfo()) return EL; return getCouldNotCompute(); } static ConstantInt * EvaluateConstantChrecAtConstant(const SCEVAddRecExpr *AddRec, ConstantInt *C, ScalarEvolution &SE) { const SCEV *InVal = SE.getConstant(C); const SCEV *Val = AddRec->evaluateAtIteration(InVal, SE); assert(isa(Val) && "Evaluation of SCEV at constant didn't fold correctly?"); return cast(Val)->getValue(); } /// Given an exit condition of 'icmp op load X, cst', try to see if we can /// compute the backedge execution count. ScalarEvolution::ExitLimit ScalarEvolution::computeLoadConstantCompareExitLimit( LoadInst *LI, Constant *RHS, const Loop *L, ICmpInst::Predicate predicate) { if (LI->isVolatile()) return getCouldNotCompute(); // Check to see if the loaded pointer is a getelementptr of a global. // TODO: Use SCEV instead of manually grubbing with GEPs. GetElementPtrInst *GEP = dyn_cast(LI->getOperand(0)); if (!GEP) return getCouldNotCompute(); // Make sure that it is really a constant global we are gepping, with an // initializer, and make sure the first IDX is really 0. GlobalVariable *GV = dyn_cast(GEP->getOperand(0)); if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer() || GEP->getNumOperands() < 3 || !isa(GEP->getOperand(1)) || !cast(GEP->getOperand(1))->isNullValue()) return getCouldNotCompute(); // Okay, we allow one non-constant index into the GEP instruction. Value *VarIdx = nullptr; std::vector Indexes; unsigned VarIdxNum = 0; for (unsigned i = 2, e = GEP->getNumOperands(); i != e; ++i) if (ConstantInt *CI = dyn_cast(GEP->getOperand(i))) { Indexes.push_back(CI); } else if (!isa(GEP->getOperand(i))) { if (VarIdx) return getCouldNotCompute(); // Multiple non-constant idx's. VarIdx = GEP->getOperand(i); VarIdxNum = i-2; Indexes.push_back(nullptr); } // Loop-invariant loads may be a byproduct of loop optimization. Skip them. if (!VarIdx) return getCouldNotCompute(); // Okay, we know we have a (load (gep GV, 0, X)) comparison with a constant. // Check to see if X is a loop variant variable value now. const SCEV *Idx = getSCEV(VarIdx); Idx = getSCEVAtScope(Idx, L); // We can only recognize very limited forms of loop index expressions, in // particular, only affine AddRec's like {C1,+,C2}. const SCEVAddRecExpr *IdxExpr = dyn_cast(Idx); if (!IdxExpr || !IdxExpr->isAffine() || isLoopInvariant(IdxExpr, L) || !isa(IdxExpr->getOperand(0)) || !isa(IdxExpr->getOperand(1))) return getCouldNotCompute(); unsigned MaxSteps = MaxBruteForceIterations; for (unsigned IterationNum = 0; IterationNum != MaxSteps; ++IterationNum) { ConstantInt *ItCst = ConstantInt::get( cast(IdxExpr->getType()), IterationNum); ConstantInt *Val = EvaluateConstantChrecAtConstant(IdxExpr, ItCst, *this); // Form the GEP offset. Indexes[VarIdxNum] = Val; Constant *Result = ConstantFoldLoadThroughGEPIndices(GV->getInitializer(), Indexes); if (!Result) break; // Cannot compute! // Evaluate the condition for this iteration. Result = ConstantExpr::getICmp(predicate, Result, RHS); if (!isa(Result)) break; // Couldn't decide for sure if (cast(Result)->getValue().isMinValue()) { ++NumArrayLenItCounts; return getConstant(ItCst); // Found terminating iteration! } } return getCouldNotCompute(); } ScalarEvolution::ExitLimit ScalarEvolution::computeShiftCompareExitLimit( Value *LHS, Value *RHSV, const Loop *L, ICmpInst::Predicate Pred) { ConstantInt *RHS = dyn_cast(RHSV); if (!RHS) return getCouldNotCompute(); const BasicBlock *Latch = L->getLoopLatch(); if (!Latch) return getCouldNotCompute(); const BasicBlock *Predecessor = L->getLoopPredecessor(); if (!Predecessor) return getCouldNotCompute(); // Return true if V is of the form "LHS `shift_op` ". // Return LHS in OutLHS and shift_opt in OutOpCode. auto MatchPositiveShift = [](Value *V, Value *&OutLHS, Instruction::BinaryOps &OutOpCode) { using namespace PatternMatch; ConstantInt *ShiftAmt; if (match(V, m_LShr(m_Value(OutLHS), m_ConstantInt(ShiftAmt)))) OutOpCode = Instruction::LShr; else if (match(V, m_AShr(m_Value(OutLHS), m_ConstantInt(ShiftAmt)))) OutOpCode = Instruction::AShr; else if (match(V, m_Shl(m_Value(OutLHS), m_ConstantInt(ShiftAmt)))) OutOpCode = Instruction::Shl; else return false; return ShiftAmt->getValue().isStrictlyPositive(); }; // Recognize a "shift recurrence" either of the form %iv or of %iv.shifted in // // loop: // %iv = phi i32 [ %iv.shifted, %loop ], [ %val, %preheader ] // %iv.shifted = lshr i32 %iv, // // Return true on a succesful match. Return the corresponding PHI node (%iv // above) in PNOut and the opcode of the shift operation in OpCodeOut. auto MatchShiftRecurrence = [&](Value *V, PHINode *&PNOut, Instruction::BinaryOps &OpCodeOut) { Optional PostShiftOpCode; { Instruction::BinaryOps OpC; Value *V; // If we encounter a shift instruction, "peel off" the shift operation, // and remember that we did so. Later when we inspect %iv's backedge // value, we will make sure that the backedge value uses the same // operation. // // Note: the peeled shift operation does not have to be the same // instruction as the one feeding into the PHI's backedge value. We only // really care about it being the same *kind* of shift instruction -- // that's all that is required for our later inferences to hold. if (MatchPositiveShift(LHS, V, OpC)) { PostShiftOpCode = OpC; LHS = V; } } PNOut = dyn_cast(LHS); if (!PNOut || PNOut->getParent() != L->getHeader()) return false; Value *BEValue = PNOut->getIncomingValueForBlock(Latch); Value *OpLHS; return // The backedge value for the PHI node must be a shift by a positive // amount MatchPositiveShift(BEValue, OpLHS, OpCodeOut) && // of the PHI node itself OpLHS == PNOut && // and the kind of shift should be match the kind of shift we peeled // off, if any. (!PostShiftOpCode.hasValue() || *PostShiftOpCode == OpCodeOut); }; PHINode *PN; Instruction::BinaryOps OpCode; if (!MatchShiftRecurrence(LHS, PN, OpCode)) return getCouldNotCompute(); const DataLayout &DL = getDataLayout(); // The key rationale for this optimization is that for some kinds of shift // recurrences, the value of the recurrence "stabilizes" to either 0 or -1 // within a finite number of iterations. If the condition guarding the // backedge (in the sense that the backedge is taken if the condition is true) // is false for the value the shift recurrence stabilizes to, then we know // that the backedge is taken only a finite number of times. ConstantInt *StableValue = nullptr; switch (OpCode) { default: llvm_unreachable("Impossible case!"); case Instruction::AShr: { // {K,ashr,} stabilizes to signum(K) in at most // bitwidth(K) iterations. Value *FirstValue = PN->getIncomingValueForBlock(Predecessor); bool KnownZero, KnownOne; ComputeSignBit(FirstValue, KnownZero, KnownOne, DL, 0, nullptr, Predecessor->getTerminator(), &DT); auto *Ty = cast(RHS->getType()); if (KnownZero) StableValue = ConstantInt::get(Ty, 0); else if (KnownOne) StableValue = ConstantInt::get(Ty, -1, true); else return getCouldNotCompute(); break; } case Instruction::LShr: case Instruction::Shl: // Both {K,lshr,} and {K,shl,} // stabilize to 0 in at most bitwidth(K) iterations. StableValue = ConstantInt::get(cast(RHS->getType()), 0); break; } auto *Result = ConstantFoldCompareInstOperands(Pred, StableValue, RHS, DL, &TLI); assert(Result->getType()->isIntegerTy(1) && "Otherwise cannot be an operand to a branch instruction"); if (Result->isZeroValue()) { unsigned BitWidth = getTypeSizeInBits(RHS->getType()); const SCEV *UpperBound = getConstant(getEffectiveSCEVType(RHS->getType()), BitWidth); SCEVUnionPredicate P; return ExitLimit(getCouldNotCompute(), UpperBound, P); } return getCouldNotCompute(); } /// Return true if we can constant fold an instruction of the specified type, /// assuming that all operands were constants. static bool CanConstantFold(const Instruction *I) { if (isa(I) || isa(I) || isa(I) || isa(I) || isa(I) || isa(I)) return true; if (const CallInst *CI = dyn_cast(I)) if (const Function *F = CI->getCalledFunction()) return canConstantFoldCallTo(F); return false; } /// Determine whether this instruction can constant evolve within this loop /// assuming its operands can all constant evolve. static bool canConstantEvolve(Instruction *I, const Loop *L) { // An instruction outside of the loop can't be derived from a loop PHI. if (!L->contains(I)) return false; if (isa(I)) { // We don't currently keep track of the control flow needed to evaluate // PHIs, so we cannot handle PHIs inside of loops. return L->getHeader() == I->getParent(); } // If we won't be able to constant fold this expression even if the operands // are constants, bail early. return CanConstantFold(I); } /// getConstantEvolvingPHIOperands - Implement getConstantEvolvingPHI by /// recursing through each instruction operand until reaching a loop header phi. static PHINode * getConstantEvolvingPHIOperands(Instruction *UseInst, const Loop *L, DenseMap &PHIMap) { // Otherwise, we can evaluate this instruction if all of its operands are // constant or derived from a PHI node themselves. PHINode *PHI = nullptr; for (Value *Op : UseInst->operands()) { if (isa(Op)) continue; Instruction *OpInst = dyn_cast(Op); if (!OpInst || !canConstantEvolve(OpInst, L)) return nullptr; PHINode *P = dyn_cast(OpInst); if (!P) // If this operand is already visited, reuse the prior result. // We may have P != PHI if this is the deepest point at which the // inconsistent paths meet. P = PHIMap.lookup(OpInst); if (!P) { // Recurse and memoize the results, whether a phi is found or not. // This recursive call invalidates pointers into PHIMap. P = getConstantEvolvingPHIOperands(OpInst, L, PHIMap); PHIMap[OpInst] = P; } if (!P) return nullptr; // Not evolving from PHI if (PHI && PHI != P) return nullptr; // Evolving from multiple different PHIs. PHI = P; } // This is a expression evolving from a constant PHI! return PHI; } /// getConstantEvolvingPHI - Given an LLVM value and a loop, return a PHI node /// in the loop that V is derived from. We allow arbitrary operations along the /// way, but the operands of an operation must either be constants or a value /// derived from a constant PHI. If this expression does not fit with these /// constraints, return null. static PHINode *getConstantEvolvingPHI(Value *V, const Loop *L) { Instruction *I = dyn_cast(V); if (!I || !canConstantEvolve(I, L)) return nullptr; if (PHINode *PN = dyn_cast(I)) return PN; // Record non-constant instructions contained by the loop. DenseMap PHIMap; return getConstantEvolvingPHIOperands(I, L, PHIMap); } /// EvaluateExpression - Given an expression that passes the /// getConstantEvolvingPHI predicate, evaluate its value assuming the PHI node /// in the loop has the value PHIVal. If we can't fold this expression for some /// reason, return null. static Constant *EvaluateExpression(Value *V, const Loop *L, DenseMap &Vals, const DataLayout &DL, const TargetLibraryInfo *TLI) { // Convenient constant check, but redundant for recursive calls. if (Constant *C = dyn_cast(V)) return C; Instruction *I = dyn_cast(V); if (!I) return nullptr; if (Constant *C = Vals.lookup(I)) return C; // An instruction inside the loop depends on a value outside the loop that we // weren't given a mapping for, or a value such as a call inside the loop. if (!canConstantEvolve(I, L)) return nullptr; // An unmapped PHI can be due to a branch or another loop inside this loop, // or due to this not being the initial iteration through a loop where we // couldn't compute the evolution of this particular PHI last time. if (isa(I)) return nullptr; std::vector Operands(I->getNumOperands()); for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { Instruction *Operand = dyn_cast(I->getOperand(i)); if (!Operand) { Operands[i] = dyn_cast(I->getOperand(i)); if (!Operands[i]) return nullptr; continue; } Constant *C = EvaluateExpression(Operand, L, Vals, DL, TLI); Vals[Operand] = C; if (!C) return nullptr; Operands[i] = C; } if (CmpInst *CI = dyn_cast(I)) return ConstantFoldCompareInstOperands(CI->getPredicate(), Operands[0], Operands[1], DL, TLI); if (LoadInst *LI = dyn_cast(I)) { if (!LI->isVolatile()) return ConstantFoldLoadFromConstPtr(Operands[0], LI->getType(), DL); } return ConstantFoldInstOperands(I, Operands, DL, TLI); } // If every incoming value to PN except the one for BB is a specific Constant, // return that, else return nullptr. static Constant *getOtherIncomingValue(PHINode *PN, BasicBlock *BB) { Constant *IncomingVal = nullptr; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { if (PN->getIncomingBlock(i) == BB) continue; auto *CurrentVal = dyn_cast(PN->getIncomingValue(i)); if (!CurrentVal) return nullptr; if (IncomingVal != CurrentVal) { if (IncomingVal) return nullptr; IncomingVal = CurrentVal; } } return IncomingVal; } /// getConstantEvolutionLoopExitValue - If we know that the specified Phi is /// in the header of its containing loop, we know the loop executes a /// constant number of times, and the PHI node is just a recurrence /// involving constants, fold it. Constant * ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, const APInt &BEs, const Loop *L) { auto I = ConstantEvolutionLoopExitValue.find(PN); if (I != ConstantEvolutionLoopExitValue.end()) return I->second; if (BEs.ugt(MaxBruteForceIterations)) return ConstantEvolutionLoopExitValue[PN] = nullptr; // Not going to evaluate it. Constant *&RetVal = ConstantEvolutionLoopExitValue[PN]; DenseMap CurrentIterVals; BasicBlock *Header = L->getHeader(); assert(PN->getParent() == Header && "Can't evaluate PHI not in loop header!"); BasicBlock *Latch = L->getLoopLatch(); if (!Latch) return nullptr; for (auto &I : *Header) { PHINode *PHI = dyn_cast(&I); if (!PHI) break; auto *StartCST = getOtherIncomingValue(PHI, Latch); if (!StartCST) continue; CurrentIterVals[PHI] = StartCST; } if (!CurrentIterVals.count(PN)) return RetVal = nullptr; Value *BEValue = PN->getIncomingValueForBlock(Latch); // Execute the loop symbolically to determine the exit value. if (BEs.getActiveBits() >= 32) return RetVal = nullptr; // More than 2^32-1 iterations?? Not doing it! unsigned NumIterations = BEs.getZExtValue(); // must be in range unsigned IterationNum = 0; const DataLayout &DL = getDataLayout(); for (; ; ++IterationNum) { if (IterationNum == NumIterations) return RetVal = CurrentIterVals[PN]; // Got exit value! // Compute the value of the PHIs for the next iteration. // EvaluateExpression adds non-phi values to the CurrentIterVals map. DenseMap NextIterVals; Constant *NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL, &TLI); if (!NextPHI) return nullptr; // Couldn't evaluate! NextIterVals[PN] = NextPHI; bool StoppedEvolving = NextPHI == CurrentIterVals[PN]; // Also evaluate the other PHI nodes. However, we don't get to stop if we // cease to be able to evaluate one of them or if they stop evolving, // because that doesn't necessarily prevent us from computing PN. SmallVector, 8> PHIsToCompute; for (const auto &I : CurrentIterVals) { PHINode *PHI = dyn_cast(I.first); if (!PHI || PHI == PN || PHI->getParent() != Header) continue; PHIsToCompute.emplace_back(PHI, I.second); } // We use two distinct loops because EvaluateExpression may invalidate any // iterators into CurrentIterVals. for (const auto &I : PHIsToCompute) { PHINode *PHI = I.first; Constant *&NextPHI = NextIterVals[PHI]; if (!NextPHI) { // Not already computed. Value *BEValue = PHI->getIncomingValueForBlock(Latch); NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL, &TLI); } if (NextPHI != I.second) StoppedEvolving = false; } // If all entries in CurrentIterVals == NextIterVals then we can stop // iterating, the loop can't continue to change. if (StoppedEvolving) return RetVal = CurrentIterVals[PN]; CurrentIterVals.swap(NextIterVals); } } const SCEV *ScalarEvolution::computeExitCountExhaustively(const Loop *L, Value *Cond, bool ExitWhen) { PHINode *PN = getConstantEvolvingPHI(Cond, L); if (!PN) return getCouldNotCompute(); // If the loop is canonicalized, the PHI will have exactly two entries. // That's the only form we support here. if (PN->getNumIncomingValues() != 2) return getCouldNotCompute(); DenseMap CurrentIterVals; BasicBlock *Header = L->getHeader(); assert(PN->getParent() == Header && "Can't evaluate PHI not in loop header!"); BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Should follow from NumIncomingValues == 2!"); for (auto &I : *Header) { PHINode *PHI = dyn_cast(&I); if (!PHI) break; auto *StartCST = getOtherIncomingValue(PHI, Latch); if (!StartCST) continue; CurrentIterVals[PHI] = StartCST; } if (!CurrentIterVals.count(PN)) return getCouldNotCompute(); // Okay, we find a PHI node that defines the trip count of this loop. Execute // the loop symbolically to determine when the condition gets a value of // "ExitWhen". unsigned MaxIterations = MaxBruteForceIterations; // Limit analysis. const DataLayout &DL = getDataLayout(); for (unsigned IterationNum = 0; IterationNum != MaxIterations;++IterationNum){ auto *CondVal = dyn_cast_or_null( EvaluateExpression(Cond, L, CurrentIterVals, DL, &TLI)); // Couldn't symbolically evaluate. if (!CondVal) return getCouldNotCompute(); if (CondVal->getValue() == uint64_t(ExitWhen)) { ++NumBruteForceTripCountsComputed; return getConstant(Type::getInt32Ty(getContext()), IterationNum); } // Update all the PHI nodes for the next iteration. DenseMap NextIterVals; // Create a list of which PHIs we need to compute. We want to do this before // calling EvaluateExpression on them because that may invalidate iterators // into CurrentIterVals. SmallVector PHIsToCompute; for (const auto &I : CurrentIterVals) { PHINode *PHI = dyn_cast(I.first); if (!PHI || PHI->getParent() != Header) continue; PHIsToCompute.push_back(PHI); } for (PHINode *PHI : PHIsToCompute) { Constant *&NextPHI = NextIterVals[PHI]; if (NextPHI) continue; // Already computed! Value *BEValue = PHI->getIncomingValueForBlock(Latch); NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL, &TLI); } CurrentIterVals.swap(NextIterVals); } // Too many iterations were needed to evaluate. return getCouldNotCompute(); } const SCEV *ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) { SmallVector, 2> &Values = ValuesAtScopes[V]; // Check to see if we've folded this expression at this loop before. for (auto &LS : Values) if (LS.first == L) return LS.second ? LS.second : V; Values.emplace_back(L, nullptr); // Otherwise compute it. const SCEV *C = computeSCEVAtScope(V, L); for (auto &LS : reverse(ValuesAtScopes[V])) if (LS.first == L) { LS.second = C; break; } return C; } /// This builds up a Constant using the ConstantExpr interface. That way, we /// will return Constants for objects which aren't represented by a /// SCEVConstant, because SCEVConstant is restricted to ConstantInt. /// Returns NULL if the SCEV isn't representable as a Constant. static Constant *BuildConstantFromSCEV(const SCEV *V) { switch (static_cast(V->getSCEVType())) { case scCouldNotCompute: case scAddRecExpr: break; case scConstant: return cast(V)->getValue(); case scUnknown: return dyn_cast(cast(V)->getValue()); case scSignExtend: { const SCEVSignExtendExpr *SS = cast(V); if (Constant *CastOp = BuildConstantFromSCEV(SS->getOperand())) return ConstantExpr::getSExt(CastOp, SS->getType()); break; } case scZeroExtend: { const SCEVZeroExtendExpr *SZ = cast(V); if (Constant *CastOp = BuildConstantFromSCEV(SZ->getOperand())) return ConstantExpr::getZExt(CastOp, SZ->getType()); break; } case scTruncate: { const SCEVTruncateExpr *ST = cast(V); if (Constant *CastOp = BuildConstantFromSCEV(ST->getOperand())) return ConstantExpr::getTrunc(CastOp, ST->getType()); break; } case scAddExpr: { const SCEVAddExpr *SA = cast(V); if (Constant *C = BuildConstantFromSCEV(SA->getOperand(0))) { if (PointerType *PTy = dyn_cast(C->getType())) { unsigned AS = PTy->getAddressSpace(); Type *DestPtrTy = Type::getInt8PtrTy(C->getContext(), AS); C = ConstantExpr::getBitCast(C, DestPtrTy); } for (unsigned i = 1, e = SA->getNumOperands(); i != e; ++i) { Constant *C2 = BuildConstantFromSCEV(SA->getOperand(i)); if (!C2) return nullptr; // First pointer! if (!C->getType()->isPointerTy() && C2->getType()->isPointerTy()) { unsigned AS = C2->getType()->getPointerAddressSpace(); std::swap(C, C2); Type *DestPtrTy = Type::getInt8PtrTy(C->getContext(), AS); // The offsets have been converted to bytes. We can add bytes to an // i8* by GEP with the byte count in the first index. C = ConstantExpr::getBitCast(C, DestPtrTy); } // Don't bother trying to sum two pointers. We probably can't // statically compute a load that results from it anyway. if (C2->getType()->isPointerTy()) return nullptr; if (PointerType *PTy = dyn_cast(C->getType())) { if (PTy->getElementType()->isStructTy()) C2 = ConstantExpr::getIntegerCast( C2, Type::getInt32Ty(C->getContext()), true); C = ConstantExpr::getGetElementPtr(PTy->getElementType(), C, C2); } else C = ConstantExpr::getAdd(C, C2); } return C; } break; } case scMulExpr: { const SCEVMulExpr *SM = cast(V); if (Constant *C = BuildConstantFromSCEV(SM->getOperand(0))) { // Don't bother with pointers at all. if (C->getType()->isPointerTy()) return nullptr; for (unsigned i = 1, e = SM->getNumOperands(); i != e; ++i) { Constant *C2 = BuildConstantFromSCEV(SM->getOperand(i)); if (!C2 || C2->getType()->isPointerTy()) return nullptr; C = ConstantExpr::getMul(C, C2); } return C; } break; } case scUDivExpr: { const SCEVUDivExpr *SU = cast(V); if (Constant *LHS = BuildConstantFromSCEV(SU->getLHS())) if (Constant *RHS = BuildConstantFromSCEV(SU->getRHS())) if (LHS->getType() == RHS->getType()) return ConstantExpr::getUDiv(LHS, RHS); break; } case scSMaxExpr: case scUMaxExpr: break; // TODO: smax, umax. } return nullptr; } const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { if (isa(V)) return V; // If this instruction is evolved from a constant-evolving PHI, compute the // exit value from the loop without using SCEVs. if (const SCEVUnknown *SU = dyn_cast(V)) { if (Instruction *I = dyn_cast(SU->getValue())) { const Loop *LI = this->LI[I->getParent()]; if (LI && LI->getParentLoop() == L) // Looking for loop exit value. if (PHINode *PN = dyn_cast(I)) if (PN->getParent() == LI->getHeader()) { // Okay, there is no closed form solution for the PHI node. Check // to see if the loop that contains it has a known backedge-taken // count. If so, we may be able to force computation of the exit // value. const SCEV *BackedgeTakenCount = getBackedgeTakenCount(LI); if (const SCEVConstant *BTCC = dyn_cast(BackedgeTakenCount)) { // Okay, we know how many times the containing loop executes. If // this is a constant evolving PHI node, get the final value at // the specified iteration number. Constant *RV = getConstantEvolutionLoopExitValue(PN, BTCC->getAPInt(), LI); if (RV) return getSCEV(RV); } } // Okay, this is an expression that we cannot symbolically evaluate // into a SCEV. Check to see if it's possible to symbolically evaluate // the arguments into constants, and if so, try to constant propagate the // result. This is particularly useful for computing loop exit values. if (CanConstantFold(I)) { SmallVector Operands; bool MadeImprovement = false; for (Value *Op : I->operands()) { if (Constant *C = dyn_cast(Op)) { Operands.push_back(C); continue; } // If any of the operands is non-constant and if they are // non-integer and non-pointer, don't even try to analyze them // with scev techniques. if (!isSCEVable(Op->getType())) return V; const SCEV *OrigV = getSCEV(Op); const SCEV *OpV = getSCEVAtScope(OrigV, L); MadeImprovement |= OrigV != OpV; Constant *C = BuildConstantFromSCEV(OpV); if (!C) return V; if (C->getType() != Op->getType()) C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false, Op->getType(), false), C, Op->getType()); Operands.push_back(C); } // Check to see if getSCEVAtScope actually made an improvement. if (MadeImprovement) { Constant *C = nullptr; const DataLayout &DL = getDataLayout(); if (const CmpInst *CI = dyn_cast(I)) C = ConstantFoldCompareInstOperands(CI->getPredicate(), Operands[0], Operands[1], DL, &TLI); else if (const LoadInst *LI = dyn_cast(I)) { if (!LI->isVolatile()) C = ConstantFoldLoadFromConstPtr(Operands[0], LI->getType(), DL); } else C = ConstantFoldInstOperands(I, Operands, DL, &TLI); if (!C) return V; return getSCEV(C); } } } // This is some other type of SCEVUnknown, just return it. return V; } if (const SCEVCommutativeExpr *Comm = dyn_cast(V)) { // Avoid performing the look-up in the common case where the specified // expression has no loop-variant portions. for (unsigned i = 0, e = Comm->getNumOperands(); i != e; ++i) { const SCEV *OpAtScope = getSCEVAtScope(Comm->getOperand(i), L); if (OpAtScope != Comm->getOperand(i)) { // Okay, at least one of these operands is loop variant but might be // foldable. Build a new instance of the folded commutative expression. SmallVector NewOps(Comm->op_begin(), Comm->op_begin()+i); NewOps.push_back(OpAtScope); for (++i; i != e; ++i) { OpAtScope = getSCEVAtScope(Comm->getOperand(i), L); NewOps.push_back(OpAtScope); } if (isa(Comm)) return getAddExpr(NewOps); if (isa(Comm)) return getMulExpr(NewOps); if (isa(Comm)) return getSMaxExpr(NewOps); if (isa(Comm)) return getUMaxExpr(NewOps); llvm_unreachable("Unknown commutative SCEV type!"); } } // If we got here, all operands are loop invariant. return Comm; } if (const SCEVUDivExpr *Div = dyn_cast(V)) { const SCEV *LHS = getSCEVAtScope(Div->getLHS(), L); const SCEV *RHS = getSCEVAtScope(Div->getRHS(), L); if (LHS == Div->getLHS() && RHS == Div->getRHS()) return Div; // must be loop invariant return getUDivExpr(LHS, RHS); } // If this is a loop recurrence for a loop that does not contain L, then we // are dealing with the final value computed by the loop. if (const SCEVAddRecExpr *AddRec = dyn_cast(V)) { // First, attempt to evaluate each operand. // Avoid performing the look-up in the common case where the specified // expression has no loop-variant portions. for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) { const SCEV *OpAtScope = getSCEVAtScope(AddRec->getOperand(i), L); if (OpAtScope == AddRec->getOperand(i)) continue; // Okay, at least one of these operands is loop variant but might be // foldable. Build a new instance of the folded commutative expression. SmallVector NewOps(AddRec->op_begin(), AddRec->op_begin()+i); NewOps.push_back(OpAtScope); for (++i; i != e; ++i) NewOps.push_back(getSCEVAtScope(AddRec->getOperand(i), L)); const SCEV *FoldedRec = getAddRecExpr(NewOps, AddRec->getLoop(), AddRec->getNoWrapFlags(SCEV::FlagNW)); AddRec = dyn_cast(FoldedRec); // The addrec may be folded to a nonrecurrence, for example, if the // induction variable is multiplied by zero after constant folding. Go // ahead and return the folded value. if (!AddRec) return FoldedRec; break; } // If the scope is outside the addrec's loop, evaluate it by using the // loop exit value of the addrec. if (!AddRec->getLoop()->contains(L)) { // To evaluate this recurrence, we need to know how many times the AddRec // loop iterates. Compute this now. const SCEV *BackedgeTakenCount = getBackedgeTakenCount(AddRec->getLoop()); if (BackedgeTakenCount == getCouldNotCompute()) return AddRec; // Then, evaluate the AddRec. return AddRec->evaluateAtIteration(BackedgeTakenCount, *this); } return AddRec; } if (const SCEVZeroExtendExpr *Cast = dyn_cast(V)) { const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L); if (Op == Cast->getOperand()) return Cast; // must be loop invariant return getZeroExtendExpr(Op, Cast->getType()); } if (const SCEVSignExtendExpr *Cast = dyn_cast(V)) { const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L); if (Op == Cast->getOperand()) return Cast; // must be loop invariant return getSignExtendExpr(Op, Cast->getType()); } if (const SCEVTruncateExpr *Cast = dyn_cast(V)) { const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L); if (Op == Cast->getOperand()) return Cast; // must be loop invariant return getTruncateExpr(Op, Cast->getType()); } llvm_unreachable("Unknown SCEV type!"); } const SCEV *ScalarEvolution::getSCEVAtScope(Value *V, const Loop *L) { return getSCEVAtScope(getSCEV(V), L); } /// Finds the minimum unsigned root of the following equation: /// /// A * X = B (mod N) /// /// where N = 2^BW and BW is the common bit width of A and B. The signedness of /// A and B isn't important. /// /// If the equation does not have a solution, SCEVCouldNotCompute is returned. static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const APInt &B, ScalarEvolution &SE) { uint32_t BW = A.getBitWidth(); assert(BW == B.getBitWidth() && "Bit widths must be the same."); assert(A != 0 && "A must be non-zero."); // 1. D = gcd(A, N) // // The gcd of A and N may have only one prime factor: 2. The number of // trailing zeros in A is its multiplicity uint32_t Mult2 = A.countTrailingZeros(); // D = 2^Mult2 // 2. Check if B is divisible by D. // // B is divisible by D if and only if the multiplicity of prime factor 2 for B // is not less than multiplicity of this prime factor for D. if (B.countTrailingZeros() < Mult2) return SE.getCouldNotCompute(); // 3. Compute I: the multiplicative inverse of (A / D) in arithmetic // modulo (N / D). // // (N / D) may need BW+1 bits in its representation. Hence, we'll use this // bit width during computations. APInt AD = A.lshr(Mult2).zext(BW + 1); // AD = A / D APInt Mod(BW + 1, 0); Mod.setBit(BW - Mult2); // Mod = N / D APInt I = AD.multiplicativeInverse(Mod); // 4. Compute the minimum unsigned root of the equation: // I * (B / D) mod (N / D) APInt Result = (I * B.lshr(Mult2).zext(BW + 1)).urem(Mod); // The result is guaranteed to be less than 2^BW so we may truncate it to BW // bits. return SE.getConstant(Result.trunc(BW)); } /// Find the roots of the quadratic equation for the given quadratic chrec /// {L,+,M,+,N}. This returns either the two roots (which might be the same) or /// two SCEVCouldNotCompute objects. /// static Optional> SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) { assert(AddRec->getNumOperands() == 3 && "This is not a quadratic chrec!"); const SCEVConstant *LC = dyn_cast(AddRec->getOperand(0)); const SCEVConstant *MC = dyn_cast(AddRec->getOperand(1)); const SCEVConstant *NC = dyn_cast(AddRec->getOperand(2)); // We currently can only solve this if the coefficients are constants. if (!LC || !MC || !NC) return None; uint32_t BitWidth = LC->getAPInt().getBitWidth(); const APInt &L = LC->getAPInt(); const APInt &M = MC->getAPInt(); const APInt &N = NC->getAPInt(); APInt Two(BitWidth, 2); APInt Four(BitWidth, 4); { using namespace APIntOps; const APInt& C = L; // Convert from chrec coefficients to polynomial coefficients AX^2+BX+C // The B coefficient is M-N/2 APInt B(M); B -= sdiv(N,Two); // The A coefficient is N/2 APInt A(N.sdiv(Two)); // Compute the B^2-4ac term. APInt SqrtTerm(B); SqrtTerm *= B; SqrtTerm -= Four * (A * C); if (SqrtTerm.isNegative()) { // The loop is provably infinite. return None; } // Compute sqrt(B^2-4ac). This is guaranteed to be the nearest // integer value or else APInt::sqrt() will assert. APInt SqrtVal(SqrtTerm.sqrt()); // Compute the two solutions for the quadratic formula. // The divisions must be performed as signed divisions. APInt NegB(-B); APInt TwoA(A << 1); if (TwoA.isMinValue()) return None; LLVMContext &Context = SE.getContext(); ConstantInt *Solution1 = ConstantInt::get(Context, (NegB + SqrtVal).sdiv(TwoA)); ConstantInt *Solution2 = ConstantInt::get(Context, (NegB - SqrtVal).sdiv(TwoA)); return std::make_pair(cast(SE.getConstant(Solution1)), cast(SE.getConstant(Solution2))); } // end APIntOps namespace } ScalarEvolution::ExitLimit ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit, bool AllowPredicates) { // This is only used for loops with a "x != y" exit test. The exit condition // is now expressed as a single expression, V = x-y. So the exit test is // effectively V != 0. We know and take advantage of the fact that this // expression only being used in a comparison by zero context. SCEVUnionPredicate P; // If the value is a constant if (const SCEVConstant *C = dyn_cast(V)) { // If the value is already zero, the branch will execute zero times. if (C->getValue()->isZero()) return C; return getCouldNotCompute(); // Otherwise it will loop infinitely. } const SCEVAddRecExpr *AddRec = dyn_cast(V); if (!AddRec && AllowPredicates) // Try to make this an AddRec using runtime tests, in the first X // iterations of this loop, where X is the SCEV expression found by the // algorithm below. AddRec = convertSCEVToAddRecWithPredicates(V, L, P); if (!AddRec || AddRec->getLoop() != L) return getCouldNotCompute(); // If this is a quadratic (3-term) AddRec {L,+,M,+,N}, find the roots of // the quadratic equation to solve it. if (AddRec->isQuadratic() && AddRec->getType()->isIntegerTy()) { if (auto Roots = SolveQuadraticEquation(AddRec, *this)) { const SCEVConstant *R1 = Roots->first; const SCEVConstant *R2 = Roots->second; // Pick the smallest positive root value. if (ConstantInt *CB = dyn_cast(ConstantExpr::getICmp( CmpInst::ICMP_ULT, R1->getValue(), R2->getValue()))) { if (!CB->getZExtValue()) std::swap(R1, R2); // R1 is the minimum root now. // We can only use this value if the chrec ends up with an exact zero // value at this index. When solving for "X*X != 5", for example, we // should not accept a root of 2. const SCEV *Val = AddRec->evaluateAtIteration(R1, *this); if (Val->isZero()) return ExitLimit(R1, R1, P); // We found a quadratic root! } } return getCouldNotCompute(); } // Otherwise we can only handle this if it is affine. if (!AddRec->isAffine()) return getCouldNotCompute(); // If this is an affine expression, the execution count of this branch is // the minimum unsigned root of the following equation: // // Start + Step*N = 0 (mod 2^BW) // // equivalent to: // // Step*N = -Start (mod 2^BW) // // where BW is the common bit width of Start and Step. // Get the initial value for the loop. const SCEV *Start = getSCEVAtScope(AddRec->getStart(), L->getParentLoop()); const SCEV *Step = getSCEVAtScope(AddRec->getOperand(1), L->getParentLoop()); // For now we handle only constant steps. // // TODO: Handle a nonconstant Step given AddRec. If the // AddRec is NUW, then (in an unsigned sense) it cannot be counting up to wrap // to 0, it must be counting down to equal 0. Consequently, N = Start / -Step. // We have not yet seen any such cases. const SCEVConstant *StepC = dyn_cast(Step); if (!StepC || StepC->getValue()->equalsInt(0)) return getCouldNotCompute(); // For positive steps (counting up until unsigned overflow): // N = -Start/Step (as unsigned) // For negative steps (counting down to zero): // N = Start/-Step // First compute the unsigned distance from zero in the direction of Step. bool CountDown = StepC->getAPInt().isNegative(); const SCEV *Distance = CountDown ? Start : getNegativeSCEV(Start); // Handle unitary steps, which cannot wraparound. // 1*N = -Start; -1*N = Start (mod 2^BW), so: // N = Distance (as unsigned) if (StepC->getValue()->equalsInt(1) || StepC->getValue()->isAllOnesValue()) { ConstantRange CR = getUnsignedRange(Start); const SCEV *MaxBECount; if (!CountDown && CR.getUnsignedMin().isMinValue()) // When counting up, the worst starting value is 1, not 0. MaxBECount = CR.getUnsignedMax().isMinValue() ? getConstant(APInt::getMinValue(CR.getBitWidth())) : getConstant(APInt::getMaxValue(CR.getBitWidth())); else MaxBECount = getConstant(CountDown ? CR.getUnsignedMax() : -CR.getUnsignedMin()); return ExitLimit(Distance, MaxBECount, P); } // As a special case, handle the instance where Step is a positive power of // two. In this case, determining whether Step divides Distance evenly can be // done by counting and comparing the number of trailing zeros of Step and // Distance. if (!CountDown) { const APInt &StepV = StepC->getAPInt(); // StepV.isPowerOf2() returns true if StepV is an positive power of two. It // also returns true if StepV is maximally negative (eg, INT_MIN), but that // case is not handled as this code is guarded by !CountDown. if (StepV.isPowerOf2() && GetMinTrailingZeros(Distance) >= StepV.countTrailingZeros()) { // Here we've constrained the equation to be of the form // // 2^(N + k) * Distance' = (StepV == 2^N) * X (mod 2^W) ... (0) // // where we're operating on a W bit wide integer domain and k is // non-negative. The smallest unsigned solution for X is the trip count. // // (0) is equivalent to: // // 2^(N + k) * Distance' - 2^N * X = L * 2^W // <=> 2^N(2^k * Distance' - X) = L * 2^(W - N) * 2^N // <=> 2^k * Distance' - X = L * 2^(W - N) // <=> 2^k * Distance' = L * 2^(W - N) + X ... (1) // // The smallest X satisfying (1) is unsigned remainder of dividing the LHS // by 2^(W - N). // // <=> X = 2^k * Distance' URem 2^(W - N) ... (2) // // E.g. say we're solving // // 2 * Val = 2 * X (in i8) ... (3) // // then from (2), we get X = Val URem i8 128 (k = 0 in this case). // // Note: It is tempting to solve (3) by setting X = Val, but Val is not // necessarily the smallest unsigned value of X that satisfies (3). // E.g. if Val is i8 -127 then the smallest value of X that satisfies (3) // is i8 1, not i8 -127 const auto *ModuloResult = getUDivExactExpr(Distance, Step); // Since SCEV does not have a URem node, we construct one using a truncate // and a zero extend. unsigned NarrowWidth = StepV.getBitWidth() - StepV.countTrailingZeros(); auto *NarrowTy = IntegerType::get(getContext(), NarrowWidth); auto *WideTy = Distance->getType(); const SCEV *Limit = getZeroExtendExpr(getTruncateExpr(ModuloResult, NarrowTy), WideTy); return ExitLimit(Limit, Limit, P); } } // If the condition controls loop exit (the loop exits only if the expression // is true) and the addition is no-wrap we can use unsigned divide to // compute the backedge count. In this case, the step may not divide the // distance, but we don't care because if the condition is "missed" the loop // will have undefined behavior due to wrapping. if (ControlsExit && AddRec->hasNoSelfWrap() && loopHasNoAbnormalExits(AddRec->getLoop())) { const SCEV *Exact = getUDivExpr(Distance, CountDown ? getNegativeSCEV(Step) : Step); return ExitLimit(Exact, Exact, P); } // Then, try to solve the above equation provided that Start is constant. if (const SCEVConstant *StartC = dyn_cast(Start)) { const SCEV *E = SolveLinEquationWithOverflow( StepC->getValue()->getValue(), -StartC->getValue()->getValue(), *this); return ExitLimit(E, E, P); } return getCouldNotCompute(); } ScalarEvolution::ExitLimit ScalarEvolution::howFarToNonZero(const SCEV *V, const Loop *L) { // Loops that look like: while (X == 0) are very strange indeed. We don't // handle them yet except for the trivial case. This could be expanded in the // future as needed. // If the value is a constant, check to see if it is known to be non-zero // already. If so, the backedge will execute zero times. if (const SCEVConstant *C = dyn_cast(V)) { if (!C->getValue()->isNullValue()) return getZero(C->getType()); return getCouldNotCompute(); // Otherwise it will loop infinitely. } // We could implement others, but I really doubt anyone writes loops like // this, and if they did, they would already be constant folded. return getCouldNotCompute(); } std::pair ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(BasicBlock *BB) { // If the block has a unique predecessor, then there is no path from the // predecessor to the block that does not go through the direct edge // from the predecessor to the block. if (BasicBlock *Pred = BB->getSinglePredecessor()) return {Pred, BB}; // A loop's header is defined to be a block that dominates the loop. // If the header has a unique predecessor outside the loop, it must be // a block that has exactly one successor that can reach the loop. if (Loop *L = LI.getLoopFor(BB)) return {L->getLoopPredecessor(), L->getHeader()}; return {nullptr, nullptr}; } /// SCEV structural equivalence is usually sufficient for testing whether two /// expressions are equal, however for the purposes of looking for a condition /// guarding a loop, it can be useful to be a little more general, since a /// front-end may have replicated the controlling expression. /// static bool HasSameValue(const SCEV *A, const SCEV *B) { // Quick check to see if they are the same SCEV. if (A == B) return true; auto ComputesEqualValues = [](const Instruction *A, const Instruction *B) { // Not all instructions that are "identical" compute the same value. For // instance, two distinct alloca instructions allocating the same type are // identical and do not read memory; but compute distinct values. return A->isIdenticalTo(B) && (isa(A) || isa(A)); }; // Otherwise, if they're both SCEVUnknown, it's possible that they hold // two different instructions with the same value. Check for this case. if (const SCEVUnknown *AU = dyn_cast(A)) if (const SCEVUnknown *BU = dyn_cast(B)) if (const Instruction *AI = dyn_cast(AU->getValue())) if (const Instruction *BI = dyn_cast(BU->getValue())) if (ComputesEqualValues(AI, BI)) return true; // Otherwise assume they may have a different value. return false; } bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred, const SCEV *&LHS, const SCEV *&RHS, unsigned Depth) { bool Changed = false; // If we hit the max recursion limit bail out. if (Depth >= 3) return false; // Canonicalize a constant to the right side. if (const SCEVConstant *LHSC = dyn_cast(LHS)) { // Check for both operands constant. if (const SCEVConstant *RHSC = dyn_cast(RHS)) { if (ConstantExpr::getICmp(Pred, LHSC->getValue(), RHSC->getValue())->isNullValue()) goto trivially_false; else goto trivially_true; } // Otherwise swap the operands to put the constant on the right. std::swap(LHS, RHS); Pred = ICmpInst::getSwappedPredicate(Pred); Changed = true; } // If we're comparing an addrec with a value which is loop-invariant in the // addrec's loop, put the addrec on the left. Also make a dominance check, // as both operands could be addrecs loop-invariant in each other's loop. if (const SCEVAddRecExpr *AR = dyn_cast(RHS)) { const Loop *L = AR->getLoop(); if (isLoopInvariant(LHS, L) && properlyDominates(LHS, L->getHeader())) { std::swap(LHS, RHS); Pred = ICmpInst::getSwappedPredicate(Pred); Changed = true; } } // If there's a constant operand, canonicalize comparisons with boundary // cases, and canonicalize *-or-equal comparisons to regular comparisons. if (const SCEVConstant *RC = dyn_cast(RHS)) { const APInt &RA = RC->getAPInt(); switch (Pred) { default: llvm_unreachable("Unexpected ICmpInst::Predicate value!"); case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_NE: // Fold ((-1) * %a) + %b == 0 (equivalent to %b-%a == 0) into %a == %b. if (!RA) if (const SCEVAddExpr *AE = dyn_cast(LHS)) if (const SCEVMulExpr *ME = dyn_cast(AE->getOperand(0))) if (AE->getNumOperands() == 2 && ME->getNumOperands() == 2 && ME->getOperand(0)->isAllOnesValue()) { RHS = AE->getOperand(1); LHS = ME->getOperand(1); Changed = true; } break; case ICmpInst::ICMP_UGE: if ((RA - 1).isMinValue()) { Pred = ICmpInst::ICMP_NE; RHS = getConstant(RA - 1); Changed = true; break; } if (RA.isMaxValue()) { Pred = ICmpInst::ICMP_EQ; Changed = true; break; } if (RA.isMinValue()) goto trivially_true; Pred = ICmpInst::ICMP_UGT; RHS = getConstant(RA - 1); Changed = true; break; case ICmpInst::ICMP_ULE: if ((RA + 1).isMaxValue()) { Pred = ICmpInst::ICMP_NE; RHS = getConstant(RA + 1); Changed = true; break; } if (RA.isMinValue()) { Pred = ICmpInst::ICMP_EQ; Changed = true; break; } if (RA.isMaxValue()) goto trivially_true; Pred = ICmpInst::ICMP_ULT; RHS = getConstant(RA + 1); Changed = true; break; case ICmpInst::ICMP_SGE: if ((RA - 1).isMinSignedValue()) { Pred = ICmpInst::ICMP_NE; RHS = getConstant(RA - 1); Changed = true; break; } if (RA.isMaxSignedValue()) { Pred = ICmpInst::ICMP_EQ; Changed = true; break; } if (RA.isMinSignedValue()) goto trivially_true; Pred = ICmpInst::ICMP_SGT; RHS = getConstant(RA - 1); Changed = true; break; case ICmpInst::ICMP_SLE: if ((RA + 1).isMaxSignedValue()) { Pred = ICmpInst::ICMP_NE; RHS = getConstant(RA + 1); Changed = true; break; } if (RA.isMinSignedValue()) { Pred = ICmpInst::ICMP_EQ; Changed = true; break; } if (RA.isMaxSignedValue()) goto trivially_true; Pred = ICmpInst::ICMP_SLT; RHS = getConstant(RA + 1); Changed = true; break; case ICmpInst::ICMP_UGT: if (RA.isMinValue()) { Pred = ICmpInst::ICMP_NE; Changed = true; break; } if ((RA + 1).isMaxValue()) { Pred = ICmpInst::ICMP_EQ; RHS = getConstant(RA + 1); Changed = true; break; } if (RA.isMaxValue()) goto trivially_false; break; case ICmpInst::ICMP_ULT: if (RA.isMaxValue()) { Pred = ICmpInst::ICMP_NE; Changed = true; break; } if ((RA - 1).isMinValue()) { Pred = ICmpInst::ICMP_EQ; RHS = getConstant(RA - 1); Changed = true; break; } if (RA.isMinValue()) goto trivially_false; break; case ICmpInst::ICMP_SGT: if (RA.isMinSignedValue()) { Pred = ICmpInst::ICMP_NE; Changed = true; break; } if ((RA + 1).isMaxSignedValue()) { Pred = ICmpInst::ICMP_EQ; RHS = getConstant(RA + 1); Changed = true; break; } if (RA.isMaxSignedValue()) goto trivially_false; break; case ICmpInst::ICMP_SLT: if (RA.isMaxSignedValue()) { Pred = ICmpInst::ICMP_NE; Changed = true; break; } if ((RA - 1).isMinSignedValue()) { Pred = ICmpInst::ICMP_EQ; RHS = getConstant(RA - 1); Changed = true; break; } if (RA.isMinSignedValue()) goto trivially_false; break; } } // Check for obvious equality. if (HasSameValue(LHS, RHS)) { if (ICmpInst::isTrueWhenEqual(Pred)) goto trivially_true; if (ICmpInst::isFalseWhenEqual(Pred)) goto trivially_false; } // If possible, canonicalize GE/LE comparisons to GT/LT comparisons, by // adding or subtracting 1 from one of the operands. switch (Pred) { case ICmpInst::ICMP_SLE: if (!getSignedRange(RHS).getSignedMax().isMaxSignedValue()) { RHS = getAddExpr(getConstant(RHS->getType(), 1, true), RHS, SCEV::FlagNSW); Pred = ICmpInst::ICMP_SLT; Changed = true; } else if (!getSignedRange(LHS).getSignedMin().isMinSignedValue()) { LHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), LHS, SCEV::FlagNSW); Pred = ICmpInst::ICMP_SLT; Changed = true; } break; case ICmpInst::ICMP_SGE: if (!getSignedRange(RHS).getSignedMin().isMinSignedValue()) { RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS, SCEV::FlagNSW); Pred = ICmpInst::ICMP_SGT; Changed = true; } else if (!getSignedRange(LHS).getSignedMax().isMaxSignedValue()) { LHS = getAddExpr(getConstant(RHS->getType(), 1, true), LHS, SCEV::FlagNSW); Pred = ICmpInst::ICMP_SGT; Changed = true; } break; case ICmpInst::ICMP_ULE: if (!getUnsignedRange(RHS).getUnsignedMax().isMaxValue()) { RHS = getAddExpr(getConstant(RHS->getType(), 1, true), RHS, SCEV::FlagNUW); Pred = ICmpInst::ICMP_ULT; Changed = true; } else if (!getUnsignedRange(LHS).getUnsignedMin().isMinValue()) { LHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), LHS); Pred = ICmpInst::ICMP_ULT; Changed = true; } break; case ICmpInst::ICMP_UGE: if (!getUnsignedRange(RHS).getUnsignedMin().isMinValue()) { RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS); Pred = ICmpInst::ICMP_UGT; Changed = true; } else if (!getUnsignedRange(LHS).getUnsignedMax().isMaxValue()) { LHS = getAddExpr(getConstant(RHS->getType(), 1, true), LHS, SCEV::FlagNUW); Pred = ICmpInst::ICMP_UGT; Changed = true; } break; default: break; } // TODO: More simplifications are possible here. // Recursively simplify until we either hit a recursion limit or nothing // changes. if (Changed) return SimplifyICmpOperands(Pred, LHS, RHS, Depth+1); return Changed; trivially_true: // Return 0 == 0. LHS = RHS = getConstant(ConstantInt::getFalse(getContext())); Pred = ICmpInst::ICMP_EQ; return true; trivially_false: // Return 0 != 0. LHS = RHS = getConstant(ConstantInt::getFalse(getContext())); Pred = ICmpInst::ICMP_NE; return true; } bool ScalarEvolution::isKnownNegative(const SCEV *S) { return getSignedRange(S).getSignedMax().isNegative(); } bool ScalarEvolution::isKnownPositive(const SCEV *S) { return getSignedRange(S).getSignedMin().isStrictlyPositive(); } bool ScalarEvolution::isKnownNonNegative(const SCEV *S) { return !getSignedRange(S).getSignedMin().isNegative(); } bool ScalarEvolution::isKnownNonPositive(const SCEV *S) { return !getSignedRange(S).getSignedMax().isStrictlyPositive(); } bool ScalarEvolution::isKnownNonZero(const SCEV *S) { return isKnownNegative(S) || isKnownPositive(S); } bool ScalarEvolution::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { // Canonicalize the inputs first. (void)SimplifyICmpOperands(Pred, LHS, RHS); // If LHS or RHS is an addrec, check to see if the condition is true in // every iteration of the loop. // If LHS and RHS are both addrec, both conditions must be true in // every iteration of the loop. const SCEVAddRecExpr *LAR = dyn_cast(LHS); const SCEVAddRecExpr *RAR = dyn_cast(RHS); bool LeftGuarded = false; bool RightGuarded = false; if (LAR) { const Loop *L = LAR->getLoop(); if (isLoopEntryGuardedByCond(L, Pred, LAR->getStart(), RHS) && isLoopBackedgeGuardedByCond(L, Pred, LAR->getPostIncExpr(*this), RHS)) { if (!RAR) return true; LeftGuarded = true; } } if (RAR) { const Loop *L = RAR->getLoop(); if (isLoopEntryGuardedByCond(L, Pred, LHS, RAR->getStart()) && isLoopBackedgeGuardedByCond(L, Pred, LHS, RAR->getPostIncExpr(*this))) { if (!LAR) return true; RightGuarded = true; } } if (LeftGuarded && RightGuarded) return true; if (isKnownPredicateViaSplitting(Pred, LHS, RHS)) return true; // Otherwise see what can be done with known constant ranges. return isKnownPredicateViaConstantRanges(Pred, LHS, RHS); } bool ScalarEvolution::isMonotonicPredicate(const SCEVAddRecExpr *LHS, ICmpInst::Predicate Pred, bool &Increasing) { bool Result = isMonotonicPredicateImpl(LHS, Pred, Increasing); #ifndef NDEBUG // Verify an invariant: inverting the predicate should turn a monotonically // increasing change to a monotonically decreasing one, and vice versa. bool IncreasingSwapped; bool ResultSwapped = isMonotonicPredicateImpl( LHS, ICmpInst::getSwappedPredicate(Pred), IncreasingSwapped); assert(Result == ResultSwapped && "should be able to analyze both!"); if (ResultSwapped) assert(Increasing == !IncreasingSwapped && "monotonicity should flip as we flip the predicate"); #endif return Result; } bool ScalarEvolution::isMonotonicPredicateImpl(const SCEVAddRecExpr *LHS, ICmpInst::Predicate Pred, bool &Increasing) { // A zero step value for LHS means the induction variable is essentially a // loop invariant value. We don't really depend on the predicate actually // flipping from false to true (for increasing predicates, and the other way // around for decreasing predicates), all we care about is that *if* the // predicate changes then it only changes from false to true. // // A zero step value in itself is not very useful, but there may be places // where SCEV can prove X >= 0 but not prove X > 0, so it is helpful to be // as general as possible. switch (Pred) { default: return false; // Conservative answer case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: if (!LHS->hasNoUnsignedWrap()) return false; Increasing = Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE; return true; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: { if (!LHS->hasNoSignedWrap()) return false; const SCEV *Step = LHS->getStepRecurrence(*this); if (isKnownNonNegative(Step)) { Increasing = Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE; return true; } if (isKnownNonPositive(Step)) { Increasing = Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE; return true; } return false; } } llvm_unreachable("switch has default clause!"); } bool ScalarEvolution::isLoopInvariantPredicate( ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const Loop *L, ICmpInst::Predicate &InvariantPred, const SCEV *&InvariantLHS, const SCEV *&InvariantRHS) { // If there is a loop-invariant, force it into the RHS, otherwise bail out. if (!isLoopInvariant(RHS, L)) { if (!isLoopInvariant(LHS, L)) return false; std::swap(LHS, RHS); Pred = ICmpInst::getSwappedPredicate(Pred); } const SCEVAddRecExpr *ArLHS = dyn_cast(LHS); if (!ArLHS || ArLHS->getLoop() != L) return false; bool Increasing; if (!isMonotonicPredicate(ArLHS, Pred, Increasing)) return false; // If the predicate "ArLHS `Pred` RHS" monotonically increases from false to // true as the loop iterates, and the backedge is control dependent on // "ArLHS `Pred` RHS" == true then we can reason as follows: // // * if the predicate was false in the first iteration then the predicate // is never evaluated again, since the loop exits without taking the // backedge. // * if the predicate was true in the first iteration then it will // continue to be true for all future iterations since it is // monotonically increasing. // // For both the above possibilities, we can replace the loop varying // predicate with its value on the first iteration of the loop (which is // loop invariant). // // A similar reasoning applies for a monotonically decreasing predicate, by // replacing true with false and false with true in the above two bullets. auto P = Increasing ? Pred : ICmpInst::getInversePredicate(Pred); if (!isLoopBackedgeGuardedByCond(L, P, LHS, RHS)) return false; InvariantPred = Pred; InvariantLHS = ArLHS->getStart(); InvariantRHS = RHS; return true; } bool ScalarEvolution::isKnownPredicateViaConstantRanges( ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { if (HasSameValue(LHS, RHS)) return ICmpInst::isTrueWhenEqual(Pred); // This code is split out from isKnownPredicate because it is called from // within isLoopEntryGuardedByCond. auto CheckRanges = [&](const ConstantRange &RangeLHS, const ConstantRange &RangeRHS) { return ConstantRange::makeSatisfyingICmpRegion(Pred, RangeRHS) .contains(RangeLHS); }; // The check at the top of the function catches the case where the values are // known to be equal. if (Pred == CmpInst::ICMP_EQ) return false; if (Pred == CmpInst::ICMP_NE) return CheckRanges(getSignedRange(LHS), getSignedRange(RHS)) || CheckRanges(getUnsignedRange(LHS), getUnsignedRange(RHS)) || isKnownNonZero(getMinusSCEV(LHS, RHS)); if (CmpInst::isSigned(Pred)) return CheckRanges(getSignedRange(LHS), getSignedRange(RHS)); return CheckRanges(getUnsignedRange(LHS), getUnsignedRange(RHS)); } bool ScalarEvolution::isKnownPredicateViaNoOverflow(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { // Match Result to (X + Y) where Y is a constant integer. // Return Y via OutY. auto MatchBinaryAddToConst = [this](const SCEV *Result, const SCEV *X, APInt &OutY, SCEV::NoWrapFlags ExpectedFlags) { const SCEV *NonConstOp, *ConstOp; SCEV::NoWrapFlags FlagsPresent; if (!splitBinaryAdd(Result, ConstOp, NonConstOp, FlagsPresent) || !isa(ConstOp) || NonConstOp != X) return false; OutY = cast(ConstOp)->getAPInt(); return (FlagsPresent & ExpectedFlags) == ExpectedFlags; }; APInt C; switch (Pred) { default: break; case ICmpInst::ICMP_SGE: std::swap(LHS, RHS); case ICmpInst::ICMP_SLE: // X s<= (X + C) if C >= 0 if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNSW) && C.isNonNegative()) return true; // (X + C) s<= X if C <= 0 if (MatchBinaryAddToConst(LHS, RHS, C, SCEV::FlagNSW) && !C.isStrictlyPositive()) return true; break; case ICmpInst::ICMP_SGT: std::swap(LHS, RHS); case ICmpInst::ICMP_SLT: // X s< (X + C) if C > 0 if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNSW) && C.isStrictlyPositive()) return true; // (X + C) s< X if C < 0 if (MatchBinaryAddToConst(LHS, RHS, C, SCEV::FlagNSW) && C.isNegative()) return true; break; } return false; } bool ScalarEvolution::isKnownPredicateViaSplitting(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { if (Pred != ICmpInst::ICMP_ULT || ProvingSplitPredicate) return false; // Allowing arbitrary number of activations of isKnownPredicateViaSplitting on // the stack can result in exponential time complexity. SaveAndRestore Restore(ProvingSplitPredicate, true); // If L >= 0 then I `ult` L <=> I >= 0 && I `slt` L // // To prove L >= 0 we use isKnownNonNegative whereas to prove I >= 0 we use // isKnownPredicate. isKnownPredicate is more powerful, but also more // expensive; and using isKnownNonNegative(RHS) is sufficient for most of the // interesting cases seen in practice. We can consider "upgrading" L >= 0 to // use isKnownPredicate later if needed. return isKnownNonNegative(RHS) && isKnownPredicate(CmpInst::ICMP_SGE, LHS, getZero(LHS->getType())) && isKnownPredicate(CmpInst::ICMP_SLT, LHS, RHS); } bool ScalarEvolution::isImpliedViaGuard(BasicBlock *BB, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { // No need to even try if we know the module has no guards. if (!HasGuards) return false; return any_of(*BB, [&](Instruction &I) { using namespace llvm::PatternMatch; Value *Condition; return match(&I, m_Intrinsic( m_Value(Condition))) && isImpliedCond(Pred, LHS, RHS, Condition, false); }); } /// isLoopBackedgeGuardedByCond - Test whether the backedge of the loop is /// protected by a conditional between LHS and RHS. This is used to /// to eliminate casts. bool ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { // Interpret a null as meaning no loop, where there is obviously no guard // (interprocedural conditions notwithstanding). if (!L) return true; if (isKnownPredicateViaConstantRanges(Pred, LHS, RHS)) return true; BasicBlock *Latch = L->getLoopLatch(); if (!Latch) return false; BranchInst *LoopContinuePredicate = dyn_cast(Latch->getTerminator()); if (LoopContinuePredicate && LoopContinuePredicate->isConditional() && isImpliedCond(Pred, LHS, RHS, LoopContinuePredicate->getCondition(), LoopContinuePredicate->getSuccessor(0) != L->getHeader())) return true; // We don't want more than one activation of the following loops on the stack // -- that can lead to O(n!) time complexity. if (WalkingBEDominatingConds) return false; SaveAndRestore ClearOnExit(WalkingBEDominatingConds, true); // See if we can exploit a trip count to prove the predicate. const auto &BETakenInfo = getBackedgeTakenInfo(L); const SCEV *LatchBECount = BETakenInfo.getExact(Latch, this); if (LatchBECount != getCouldNotCompute()) { // We know that Latch branches back to the loop header exactly // LatchBECount times. This means the backdege condition at Latch is // equivalent to "{0,+,1} u< LatchBECount". Type *Ty = LatchBECount->getType(); auto NoWrapFlags = SCEV::NoWrapFlags(SCEV::FlagNUW | SCEV::FlagNW); const SCEV *LoopCounter = getAddRecExpr(getZero(Ty), getOne(Ty), L, NoWrapFlags); if (isImpliedCond(Pred, LHS, RHS, ICmpInst::ICMP_ULT, LoopCounter, LatchBECount)) return true; } // Check conditions due to any @llvm.assume intrinsics. for (auto &AssumeVH : AC.assumptions()) { if (!AssumeVH) continue; auto *CI = cast(AssumeVH); if (!DT.dominates(CI, Latch->getTerminator())) continue; if (isImpliedCond(Pred, LHS, RHS, CI->getArgOperand(0), false)) return true; } // If the loop is not reachable from the entry block, we risk running into an // infinite loop as we walk up into the dom tree. These loops do not matter // anyway, so we just return a conservative answer when we see them. if (!DT.isReachableFromEntry(L->getHeader())) return false; if (isImpliedViaGuard(Latch, Pred, LHS, RHS)) return true; for (DomTreeNode *DTN = DT[Latch], *HeaderDTN = DT[L->getHeader()]; DTN != HeaderDTN; DTN = DTN->getIDom()) { assert(DTN && "should reach the loop header before reaching the root!"); BasicBlock *BB = DTN->getBlock(); if (isImpliedViaGuard(BB, Pred, LHS, RHS)) return true; BasicBlock *PBB = BB->getSinglePredecessor(); if (!PBB) continue; BranchInst *ContinuePredicate = dyn_cast(PBB->getTerminator()); if (!ContinuePredicate || !ContinuePredicate->isConditional()) continue; Value *Condition = ContinuePredicate->getCondition(); // If we have an edge `E` within the loop body that dominates the only // latch, the condition guarding `E` also guards the backedge. This // reasoning works only for loops with a single latch. BasicBlockEdge DominatingEdge(PBB, BB); if (DominatingEdge.isSingleEdge()) { // We're constructively (and conservatively) enumerating edges within the // loop body that dominate the latch. The dominator tree better agree // with us on this: assert(DT.dominates(DominatingEdge, Latch) && "should be!"); if (isImpliedCond(Pred, LHS, RHS, Condition, BB != ContinuePredicate->getSuccessor(0))) return true; } } return false; } bool ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { // Interpret a null as meaning no loop, where there is obviously no guard // (interprocedural conditions notwithstanding). if (!L) return false; if (isKnownPredicateViaConstantRanges(Pred, LHS, RHS)) return true; // Starting at the loop predecessor, climb up the predecessor chain, as long // as there are predecessors that can be found that have unique successors // leading to the original header. for (std::pair Pair(L->getLoopPredecessor(), L->getHeader()); Pair.first; Pair = getPredecessorWithUniqueSuccessorForBB(Pair.first)) { if (isImpliedViaGuard(Pair.first, Pred, LHS, RHS)) return true; BranchInst *LoopEntryPredicate = dyn_cast(Pair.first->getTerminator()); if (!LoopEntryPredicate || LoopEntryPredicate->isUnconditional()) continue; if (isImpliedCond(Pred, LHS, RHS, LoopEntryPredicate->getCondition(), LoopEntryPredicate->getSuccessor(0) != Pair.second)) return true; } // Check conditions due to any @llvm.assume intrinsics. for (auto &AssumeVH : AC.assumptions()) { if (!AssumeVH) continue; auto *CI = cast(AssumeVH); if (!DT.dominates(CI, L->getHeader())) continue; if (isImpliedCond(Pred, LHS, RHS, CI->getArgOperand(0), false)) return true; } return false; } namespace { /// RAII wrapper to prevent recursive application of isImpliedCond. /// ScalarEvolution's PendingLoopPredicates set must be empty unless we are /// currently evaluating isImpliedCond. struct MarkPendingLoopPredicate { Value *Cond; DenseSet &LoopPreds; bool Pending; MarkPendingLoopPredicate(Value *C, DenseSet &LP) : Cond(C), LoopPreds(LP) { Pending = !LoopPreds.insert(Cond).second; } ~MarkPendingLoopPredicate() { if (!Pending) LoopPreds.erase(Cond); } }; } // end anonymous namespace bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, Value *FoundCondValue, bool Inverse) { MarkPendingLoopPredicate Mark(FoundCondValue, PendingLoopPredicates); if (Mark.Pending) return false; // Recursively handle And and Or conditions. if (BinaryOperator *BO = dyn_cast(FoundCondValue)) { if (BO->getOpcode() == Instruction::And) { if (!Inverse) return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse) || isImpliedCond(Pred, LHS, RHS, BO->getOperand(1), Inverse); } else if (BO->getOpcode() == Instruction::Or) { if (Inverse) return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse) || isImpliedCond(Pred, LHS, RHS, BO->getOperand(1), Inverse); } } ICmpInst *ICI = dyn_cast(FoundCondValue); if (!ICI) return false; // Now that we found a conditional branch that dominates the loop or controls // the loop latch. Check to see if it is the comparison we are looking for. ICmpInst::Predicate FoundPred; if (Inverse) FoundPred = ICI->getInversePredicate(); else FoundPred = ICI->getPredicate(); const SCEV *FoundLHS = getSCEV(ICI->getOperand(0)); const SCEV *FoundRHS = getSCEV(ICI->getOperand(1)); return isImpliedCond(Pred, LHS, RHS, FoundPred, FoundLHS, FoundRHS); } bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, ICmpInst::Predicate FoundPred, const SCEV *FoundLHS, const SCEV *FoundRHS) { // Balance the types. if (getTypeSizeInBits(LHS->getType()) < getTypeSizeInBits(FoundLHS->getType())) { if (CmpInst::isSigned(Pred)) { LHS = getSignExtendExpr(LHS, FoundLHS->getType()); RHS = getSignExtendExpr(RHS, FoundLHS->getType()); } else { LHS = getZeroExtendExpr(LHS, FoundLHS->getType()); RHS = getZeroExtendExpr(RHS, FoundLHS->getType()); } } else if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(FoundLHS->getType())) { if (CmpInst::isSigned(FoundPred)) { FoundLHS = getSignExtendExpr(FoundLHS, LHS->getType()); FoundRHS = getSignExtendExpr(FoundRHS, LHS->getType()); } else { FoundLHS = getZeroExtendExpr(FoundLHS, LHS->getType()); FoundRHS = getZeroExtendExpr(FoundRHS, LHS->getType()); } } // Canonicalize the query to match the way instcombine will have // canonicalized the comparison. if (SimplifyICmpOperands(Pred, LHS, RHS)) if (LHS == RHS) return CmpInst::isTrueWhenEqual(Pred); if (SimplifyICmpOperands(FoundPred, FoundLHS, FoundRHS)) if (FoundLHS == FoundRHS) return CmpInst::isFalseWhenEqual(FoundPred); // Check to see if we can make the LHS or RHS match. if (LHS == FoundRHS || RHS == FoundLHS) { if (isa(RHS)) { std::swap(FoundLHS, FoundRHS); FoundPred = ICmpInst::getSwappedPredicate(FoundPred); } else { std::swap(LHS, RHS); Pred = ICmpInst::getSwappedPredicate(Pred); } } // Check whether the found predicate is the same as the desired predicate. if (FoundPred == Pred) return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS); // Check whether swapping the found predicate makes it the same as the // desired predicate. if (ICmpInst::getSwappedPredicate(FoundPred) == Pred) { if (isa(RHS)) return isImpliedCondOperands(Pred, LHS, RHS, FoundRHS, FoundLHS); else return isImpliedCondOperands(ICmpInst::getSwappedPredicate(Pred), RHS, LHS, FoundLHS, FoundRHS); } // Unsigned comparison is the same as signed comparison when both the operands // are non-negative. if (CmpInst::isUnsigned(FoundPred) && CmpInst::getSignedPredicate(FoundPred) == Pred && isKnownNonNegative(FoundLHS) && isKnownNonNegative(FoundRHS)) return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS); // Check if we can make progress by sharpening ranges. if (FoundPred == ICmpInst::ICMP_NE && (isa(FoundLHS) || isa(FoundRHS))) { const SCEVConstant *C = nullptr; const SCEV *V = nullptr; if (isa(FoundLHS)) { C = cast(FoundLHS); V = FoundRHS; } else { C = cast(FoundRHS); V = FoundLHS; } // The guarding predicate tells us that C != V. If the known range // of V is [C, t), we can sharpen the range to [C + 1, t). The // range we consider has to correspond to same signedness as the // predicate we're interested in folding. APInt Min = ICmpInst::isSigned(Pred) ? getSignedRange(V).getSignedMin() : getUnsignedRange(V).getUnsignedMin(); if (Min == C->getAPInt()) { // Given (V >= Min && V != Min) we conclude V >= (Min + 1). // This is true even if (Min + 1) wraps around -- in case of // wraparound, (Min + 1) < Min, so (V >= Min => V >= (Min + 1)). APInt SharperMin = Min + 1; switch (Pred) { case ICmpInst::ICMP_SGE: case ICmpInst::ICMP_UGE: // We know V `Pred` SharperMin. If this implies LHS `Pred` // RHS, we're done. if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(SharperMin))) return true; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_UGT: // We know from the range information that (V `Pred` Min || // V == Min). We know from the guarding condition that !(V // == Min). This gives us // // V `Pred` Min || V == Min && !(V == Min) // => V `Pred` Min // // If V `Pred` Min implies LHS `Pred` RHS, we're done. if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(Min))) return true; default: // No change break; } } } // Check whether the actual condition is beyond sufficient. if (FoundPred == ICmpInst::ICMP_EQ) if (ICmpInst::isTrueWhenEqual(Pred)) if (isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS)) return true; if (Pred == ICmpInst::ICMP_NE) if (!ICmpInst::isTrueWhenEqual(FoundPred)) if (isImpliedCondOperands(FoundPred, LHS, RHS, FoundLHS, FoundRHS)) return true; // Otherwise assume the worst. return false; } bool ScalarEvolution::splitBinaryAdd(const SCEV *Expr, const SCEV *&L, const SCEV *&R, SCEV::NoWrapFlags &Flags) { const auto *AE = dyn_cast(Expr); if (!AE || AE->getNumOperands() != 2) return false; L = AE->getOperand(0); R = AE->getOperand(1); Flags = AE->getNoWrapFlags(); return true; } bool ScalarEvolution::computeConstantDifference(const SCEV *Less, const SCEV *More, APInt &C) { // We avoid subtracting expressions here because this function is usually // fairly deep in the call stack (i.e. is called many times). if (isa(Less) && isa(More)) { const auto *LAR = cast(Less); const auto *MAR = cast(More); if (LAR->getLoop() != MAR->getLoop()) return false; // We look at affine expressions only; not for correctness but to keep // getStepRecurrence cheap. if (!LAR->isAffine() || !MAR->isAffine()) return false; if (LAR->getStepRecurrence(*this) != MAR->getStepRecurrence(*this)) return false; Less = LAR->getStart(); More = MAR->getStart(); // fall through } if (isa(Less) && isa(More)) { const auto &M = cast(More)->getAPInt(); const auto &L = cast(Less)->getAPInt(); C = M - L; return true; } const SCEV *L, *R; SCEV::NoWrapFlags Flags; if (splitBinaryAdd(Less, L, R, Flags)) if (const auto *LC = dyn_cast(L)) if (R == More) { C = -(LC->getAPInt()); return true; } if (splitBinaryAdd(More, L, R, Flags)) if (const auto *LC = dyn_cast(L)) if (R == Less) { C = LC->getAPInt(); return true; } return false; } bool ScalarEvolution::isImpliedCondOperandsViaNoOverflow( ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, const SCEV *FoundRHS) { if (Pred != CmpInst::ICMP_SLT && Pred != CmpInst::ICMP_ULT) return false; const auto *AddRecLHS = dyn_cast(LHS); if (!AddRecLHS) return false; const auto *AddRecFoundLHS = dyn_cast(FoundLHS); if (!AddRecFoundLHS) return false; // We'd like to let SCEV reason about control dependencies, so we constrain // both the inequalities to be about add recurrences on the same loop. This // way we can use isLoopEntryGuardedByCond later. const Loop *L = AddRecFoundLHS->getLoop(); if (L != AddRecLHS->getLoop()) return false; // FoundLHS u< FoundRHS u< -C => (FoundLHS + C) u< (FoundRHS + C) ... (1) // // FoundLHS s< FoundRHS s< INT_MIN - C => (FoundLHS + C) s< (FoundRHS + C) // ... (2) // // Informal proof for (2), assuming (1) [*]: // // We'll also assume (A s< B) <=> ((A + INT_MIN) u< (B + INT_MIN)) ... (3)[**] // // Then // // FoundLHS s< FoundRHS s< INT_MIN - C // <=> (FoundLHS + INT_MIN) u< (FoundRHS + INT_MIN) u< -C [ using (3) ] // <=> (FoundLHS + INT_MIN + C) u< (FoundRHS + INT_MIN + C) [ using (1) ] // <=> (FoundLHS + INT_MIN + C + INT_MIN) s< // (FoundRHS + INT_MIN + C + INT_MIN) [ using (3) ] // <=> FoundLHS + C s< FoundRHS + C // // [*]: (1) can be proved by ruling out overflow. // // [**]: This can be proved by analyzing all the four possibilities: // (A s< 0, B s< 0), (A s< 0, B s>= 0), (A s>= 0, B s< 0) and // (A s>= 0, B s>= 0). // // Note: // Despite (2), "FoundRHS s< INT_MIN - C" does not mean that "FoundRHS + C" // will not sign underflow. For instance, say FoundLHS = (i8 -128), FoundRHS // = (i8 -127) and C = (i8 -100). Then INT_MIN - C = (i8 -28), and FoundRHS // s< (INT_MIN - C). Lack of sign overflow / underflow in "FoundRHS + C" is // neither necessary nor sufficient to prove "(FoundLHS + C) s< (FoundRHS + // C)". APInt LDiff, RDiff; if (!computeConstantDifference(FoundLHS, LHS, LDiff) || !computeConstantDifference(FoundRHS, RHS, RDiff) || LDiff != RDiff) return false; if (LDiff == 0) return true; APInt FoundRHSLimit; if (Pred == CmpInst::ICMP_ULT) { FoundRHSLimit = -RDiff; } else { assert(Pred == CmpInst::ICMP_SLT && "Checked above!"); FoundRHSLimit = APInt::getSignedMinValue(getTypeSizeInBits(RHS->getType())) - RDiff; } // Try to prove (1) or (2), as needed. return isLoopEntryGuardedByCond(L, Pred, FoundRHS, getConstant(FoundRHSLimit)); } bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, const SCEV *FoundRHS) { if (isImpliedCondOperandsViaRanges(Pred, LHS, RHS, FoundLHS, FoundRHS)) return true; if (isImpliedCondOperandsViaNoOverflow(Pred, LHS, RHS, FoundLHS, FoundRHS)) return true; return isImpliedCondOperandsHelper(Pred, LHS, RHS, FoundLHS, FoundRHS) || // ~x < ~y --> x > y isImpliedCondOperandsHelper(Pred, LHS, RHS, getNotSCEV(FoundRHS), getNotSCEV(FoundLHS)); } /// If Expr computes ~A, return A else return nullptr static const SCEV *MatchNotExpr(const SCEV *Expr) { const SCEVAddExpr *Add = dyn_cast(Expr); if (!Add || Add->getNumOperands() != 2 || !Add->getOperand(0)->isAllOnesValue()) return nullptr; const SCEVMulExpr *AddRHS = dyn_cast(Add->getOperand(1)); if (!AddRHS || AddRHS->getNumOperands() != 2 || !AddRHS->getOperand(0)->isAllOnesValue()) return nullptr; return AddRHS->getOperand(1); } /// Is MaybeMaxExpr an SMax or UMax of Candidate and some other values? template static bool IsMaxConsistingOf(const SCEV *MaybeMaxExpr, const SCEV *Candidate) { const MaxExprType *MaxExpr = dyn_cast(MaybeMaxExpr); if (!MaxExpr) return false; return find(MaxExpr->operands(), Candidate) != MaxExpr->op_end(); } /// Is MaybeMinExpr an SMin or UMin of Candidate and some other values? template static bool IsMinConsistingOf(ScalarEvolution &SE, const SCEV *MaybeMinExpr, const SCEV *Candidate) { const SCEV *MaybeMaxExpr = MatchNotExpr(MaybeMinExpr); if (!MaybeMaxExpr) return false; return IsMaxConsistingOf(MaybeMaxExpr, SE.getNotSCEV(Candidate)); } static bool IsKnownPredicateViaAddRecStart(ScalarEvolution &SE, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { // If both sides are affine addrecs for the same loop, with equal // steps, and we know the recurrences don't wrap, then we only // need to check the predicate on the starting values. if (!ICmpInst::isRelational(Pred)) return false; const SCEVAddRecExpr *LAR = dyn_cast(LHS); if (!LAR) return false; const SCEVAddRecExpr *RAR = dyn_cast(RHS); if (!RAR) return false; if (LAR->getLoop() != RAR->getLoop()) return false; if (!LAR->isAffine() || !RAR->isAffine()) return false; if (LAR->getStepRecurrence(SE) != RAR->getStepRecurrence(SE)) return false; SCEV::NoWrapFlags NW = ICmpInst::isSigned(Pred) ? SCEV::FlagNSW : SCEV::FlagNUW; if (!LAR->getNoWrapFlags(NW) || !RAR->getNoWrapFlags(NW)) return false; return SE.isKnownPredicate(Pred, LAR->getStart(), RAR->getStart()); } /// Is LHS `Pred` RHS true on the virtue of LHS or RHS being a Min or Max /// expression? static bool IsKnownPredicateViaMinOrMax(ScalarEvolution &SE, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { switch (Pred) { default: return false; case ICmpInst::ICMP_SGE: std::swap(LHS, RHS); // fall through case ICmpInst::ICMP_SLE: return // min(A, ...) <= A IsMinConsistingOf(SE, LHS, RHS) || // A <= max(A, ...) IsMaxConsistingOf(RHS, LHS); case ICmpInst::ICMP_UGE: std::swap(LHS, RHS); // fall through case ICmpInst::ICMP_ULE: return // min(A, ...) <= A IsMinConsistingOf(SE, LHS, RHS) || // A <= max(A, ...) IsMaxConsistingOf(RHS, LHS); } llvm_unreachable("covered switch fell through?!"); } bool ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, const SCEV *FoundRHS) { auto IsKnownPredicateFull = [this](ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { return isKnownPredicateViaConstantRanges(Pred, LHS, RHS) || IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS) || IsKnownPredicateViaAddRecStart(*this, Pred, LHS, RHS) || isKnownPredicateViaNoOverflow(Pred, LHS, RHS); }; switch (Pred) { default: llvm_unreachable("Unexpected ICmpInst::Predicate value!"); case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_NE: if (HasSameValue(LHS, FoundLHS) && HasSameValue(RHS, FoundRHS)) return true; break; case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: if (IsKnownPredicateFull(ICmpInst::ICMP_SLE, LHS, FoundLHS) && IsKnownPredicateFull(ICmpInst::ICMP_SGE, RHS, FoundRHS)) return true; break; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: if (IsKnownPredicateFull(ICmpInst::ICMP_SGE, LHS, FoundLHS) && IsKnownPredicateFull(ICmpInst::ICMP_SLE, RHS, FoundRHS)) return true; break; case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: if (IsKnownPredicateFull(ICmpInst::ICMP_ULE, LHS, FoundLHS) && IsKnownPredicateFull(ICmpInst::ICMP_UGE, RHS, FoundRHS)) return true; break; case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: if (IsKnownPredicateFull(ICmpInst::ICMP_UGE, LHS, FoundLHS) && IsKnownPredicateFull(ICmpInst::ICMP_ULE, RHS, FoundRHS)) return true; break; } return false; } bool ScalarEvolution::isImpliedCondOperandsViaRanges(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, const SCEV *FoundRHS) { if (!isa(RHS) || !isa(FoundRHS)) // The restriction on `FoundRHS` be lifted easily -- it exists only to // reduce the compile time impact of this optimization. return false; const SCEVAddExpr *AddLHS = dyn_cast(LHS); if (!AddLHS || AddLHS->getOperand(1) != FoundLHS || !isa(AddLHS->getOperand(0))) return false; APInt ConstFoundRHS = cast(FoundRHS)->getAPInt(); // `FoundLHSRange` is the range we know `FoundLHS` to be in by virtue of the // antecedent "`FoundLHS` `Pred` `FoundRHS`". ConstantRange FoundLHSRange = ConstantRange::makeAllowedICmpRegion(Pred, ConstFoundRHS); // Since `LHS` is `FoundLHS` + `AddLHS->getOperand(0)`, we can compute a range // for `LHS`: APInt Addend = cast(AddLHS->getOperand(0))->getAPInt(); ConstantRange LHSRange = FoundLHSRange.add(ConstantRange(Addend)); // We can also compute the range of values for `LHS` that satisfy the // consequent, "`LHS` `Pred` `RHS`": APInt ConstRHS = cast(RHS)->getAPInt(); ConstantRange SatisfyingLHSRange = ConstantRange::makeSatisfyingICmpRegion(Pred, ConstRHS); // The antecedent implies the consequent if every value of `LHS` that // satisfies the antecedent also satisfies the consequent. return SatisfyingLHSRange.contains(LHSRange); } bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride, bool IsSigned, bool NoWrap) { if (NoWrap) return false; unsigned BitWidth = getTypeSizeInBits(RHS->getType()); const SCEV *One = getOne(Stride->getType()); if (IsSigned) { APInt MaxRHS = getSignedRange(RHS).getSignedMax(); APInt MaxValue = APInt::getSignedMaxValue(BitWidth); APInt MaxStrideMinusOne = getSignedRange(getMinusSCEV(Stride, One)) .getSignedMax(); // SMaxRHS + SMaxStrideMinusOne > SMaxValue => overflow! return (MaxValue - MaxStrideMinusOne).slt(MaxRHS); } APInt MaxRHS = getUnsignedRange(RHS).getUnsignedMax(); APInt MaxValue = APInt::getMaxValue(BitWidth); APInt MaxStrideMinusOne = getUnsignedRange(getMinusSCEV(Stride, One)) .getUnsignedMax(); // UMaxRHS + UMaxStrideMinusOne > UMaxValue => overflow! return (MaxValue - MaxStrideMinusOne).ult(MaxRHS); } bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride, bool IsSigned, bool NoWrap) { if (NoWrap) return false; unsigned BitWidth = getTypeSizeInBits(RHS->getType()); const SCEV *One = getOne(Stride->getType()); if (IsSigned) { APInt MinRHS = getSignedRange(RHS).getSignedMin(); APInt MinValue = APInt::getSignedMinValue(BitWidth); APInt MaxStrideMinusOne = getSignedRange(getMinusSCEV(Stride, One)) .getSignedMax(); // SMinRHS - SMaxStrideMinusOne < SMinValue => overflow! return (MinValue + MaxStrideMinusOne).sgt(MinRHS); } APInt MinRHS = getUnsignedRange(RHS).getUnsignedMin(); APInt MinValue = APInt::getMinValue(BitWidth); APInt MaxStrideMinusOne = getUnsignedRange(getMinusSCEV(Stride, One)) .getUnsignedMax(); // UMinRHS - UMaxStrideMinusOne < UMinValue => overflow! return (MinValue + MaxStrideMinusOne).ugt(MinRHS); } const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step, bool Equality) { const SCEV *One = getOne(Step->getType()); Delta = Equality ? getAddExpr(Delta, Step) : getAddExpr(Delta, getMinusSCEV(Step, One)); return getUDivExpr(Delta, Step); } ScalarEvolution::ExitLimit ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS, const Loop *L, bool IsSigned, bool ControlsExit, bool AllowPredicates) { SCEVUnionPredicate P; // We handle only IV < Invariant if (!isLoopInvariant(RHS, L)) return getCouldNotCompute(); const SCEVAddRecExpr *IV = dyn_cast(LHS); if (!IV && AllowPredicates) // Try to make this an AddRec using runtime tests, in the first X // iterations of this loop, where X is the SCEV expression found by the // algorithm below. IV = convertSCEVToAddRecWithPredicates(LHS, L, P); // Avoid weird loops if (!IV || IV->getLoop() != L || !IV->isAffine()) return getCouldNotCompute(); bool NoWrap = ControlsExit && IV->getNoWrapFlags(IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW); const SCEV *Stride = IV->getStepRecurrence(*this); // Avoid negative or zero stride values if (!isKnownPositive(Stride)) return getCouldNotCompute(); // Avoid proven overflow cases: this will ensure that the backedge taken count // will not generate any unsigned overflow. Relaxed no-overflow conditions // exploit NoWrapFlags, allowing to optimize in presence of undefined // behaviors like the case of C language. if (!Stride->isOne() && doesIVOverflowOnLT(RHS, Stride, IsSigned, NoWrap)) return getCouldNotCompute(); ICmpInst::Predicate Cond = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; const SCEV *Start = IV->getStart(); const SCEV *End = RHS; if (!isLoopEntryGuardedByCond(L, Cond, getMinusSCEV(Start, Stride), RHS)) End = IsSigned ? getSMaxExpr(RHS, Start) : getUMaxExpr(RHS, Start); const SCEV *BECount = computeBECount(getMinusSCEV(End, Start), Stride, false); APInt MinStart = IsSigned ? getSignedRange(Start).getSignedMin() : getUnsignedRange(Start).getUnsignedMin(); APInt MinStride = IsSigned ? getSignedRange(Stride).getSignedMin() : getUnsignedRange(Stride).getUnsignedMin(); unsigned BitWidth = getTypeSizeInBits(LHS->getType()); APInt Limit = IsSigned ? APInt::getSignedMaxValue(BitWidth) - (MinStride - 1) : APInt::getMaxValue(BitWidth) - (MinStride - 1); // Although End can be a MAX expression we estimate MaxEnd considering only // the case End = RHS. This is safe because in the other case (End - Start) // is zero, leading to a zero maximum backedge taken count. APInt MaxEnd = IsSigned ? APIntOps::smin(getSignedRange(RHS).getSignedMax(), Limit) : APIntOps::umin(getUnsignedRange(RHS).getUnsignedMax(), Limit); const SCEV *MaxBECount; if (isa(BECount)) MaxBECount = BECount; else MaxBECount = computeBECount(getConstant(MaxEnd - MinStart), getConstant(MinStride), false); if (isa(MaxBECount)) MaxBECount = BECount; return ExitLimit(BECount, MaxBECount, P); } ScalarEvolution::ExitLimit ScalarEvolution::howManyGreaterThans(const SCEV *LHS, const SCEV *RHS, const Loop *L, bool IsSigned, bool ControlsExit, bool AllowPredicates) { SCEVUnionPredicate P; // We handle only IV > Invariant if (!isLoopInvariant(RHS, L)) return getCouldNotCompute(); const SCEVAddRecExpr *IV = dyn_cast(LHS); if (!IV && AllowPredicates) // Try to make this an AddRec using runtime tests, in the first X // iterations of this loop, where X is the SCEV expression found by the // algorithm below. IV = convertSCEVToAddRecWithPredicates(LHS, L, P); // Avoid weird loops if (!IV || IV->getLoop() != L || !IV->isAffine()) return getCouldNotCompute(); bool NoWrap = ControlsExit && IV->getNoWrapFlags(IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW); const SCEV *Stride = getNegativeSCEV(IV->getStepRecurrence(*this)); // Avoid negative or zero stride values if (!isKnownPositive(Stride)) return getCouldNotCompute(); // Avoid proven overflow cases: this will ensure that the backedge taken count // will not generate any unsigned overflow. Relaxed no-overflow conditions // exploit NoWrapFlags, allowing to optimize in presence of undefined // behaviors like the case of C language. if (!Stride->isOne() && doesIVOverflowOnGT(RHS, Stride, IsSigned, NoWrap)) return getCouldNotCompute(); ICmpInst::Predicate Cond = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; const SCEV *Start = IV->getStart(); const SCEV *End = RHS; if (!isLoopEntryGuardedByCond(L, Cond, getAddExpr(Start, Stride), RHS)) End = IsSigned ? getSMinExpr(RHS, Start) : getUMinExpr(RHS, Start); const SCEV *BECount = computeBECount(getMinusSCEV(Start, End), Stride, false); APInt MaxStart = IsSigned ? getSignedRange(Start).getSignedMax() : getUnsignedRange(Start).getUnsignedMax(); APInt MinStride = IsSigned ? getSignedRange(Stride).getSignedMin() : getUnsignedRange(Stride).getUnsignedMin(); unsigned BitWidth = getTypeSizeInBits(LHS->getType()); APInt Limit = IsSigned ? APInt::getSignedMinValue(BitWidth) + (MinStride - 1) : APInt::getMinValue(BitWidth) + (MinStride - 1); // Although End can be a MIN expression we estimate MinEnd considering only // the case End = RHS. This is safe because in the other case (Start - End) // is zero, leading to a zero maximum backedge taken count. APInt MinEnd = IsSigned ? APIntOps::smax(getSignedRange(RHS).getSignedMin(), Limit) : APIntOps::umax(getUnsignedRange(RHS).getUnsignedMin(), Limit); const SCEV *MaxBECount = getCouldNotCompute(); if (isa(BECount)) MaxBECount = BECount; else MaxBECount = computeBECount(getConstant(MaxStart - MinEnd), getConstant(MinStride), false); if (isa(MaxBECount)) MaxBECount = BECount; return ExitLimit(BECount, MaxBECount, P); } const SCEV *SCEVAddRecExpr::getNumIterationsInRange(const ConstantRange &Range, ScalarEvolution &SE) const { if (Range.isFullSet()) // Infinite loop. return SE.getCouldNotCompute(); // If the start is a non-zero constant, shift the range to simplify things. if (const SCEVConstant *SC = dyn_cast(getStart())) if (!SC->getValue()->isZero()) { SmallVector Operands(op_begin(), op_end()); Operands[0] = SE.getZero(SC->getType()); const SCEV *Shifted = SE.getAddRecExpr(Operands, getLoop(), getNoWrapFlags(FlagNW)); if (const auto *ShiftedAddRec = dyn_cast(Shifted)) return ShiftedAddRec->getNumIterationsInRange( Range.subtract(SC->getAPInt()), SE); // This is strange and shouldn't happen. return SE.getCouldNotCompute(); } // The only time we can solve this is when we have all constant indices. // Otherwise, we cannot determine the overflow conditions. if (any_of(operands(), [](const SCEV *Op) { return !isa(Op); })) return SE.getCouldNotCompute(); // Okay at this point we know that all elements of the chrec are constants and // that the start element is zero. // First check to see if the range contains zero. If not, the first // iteration exits. unsigned BitWidth = SE.getTypeSizeInBits(getType()); if (!Range.contains(APInt(BitWidth, 0))) return SE.getZero(getType()); if (isAffine()) { // If this is an affine expression then we have this situation: // Solve {0,+,A} in Range === Ax in Range // We know that zero is in the range. If A is positive then we know that // the upper value of the range must be the first possible exit value. // If A is negative then the lower of the range is the last possible loop // value. Also note that we already checked for a full range. APInt One(BitWidth,1); APInt A = cast(getOperand(1))->getAPInt(); APInt End = A.sge(One) ? (Range.getUpper() - One) : Range.getLower(); // The exit value should be (End+A)/A. APInt ExitVal = (End + A).udiv(A); ConstantInt *ExitValue = ConstantInt::get(SE.getContext(), ExitVal); // Evaluate at the exit value. If we really did fall out of the valid // range, then we computed our trip count, otherwise wrap around or other // things must have happened. ConstantInt *Val = EvaluateConstantChrecAtConstant(this, ExitValue, SE); if (Range.contains(Val->getValue())) return SE.getCouldNotCompute(); // Something strange happened // Ensure that the previous value is in the range. This is a sanity check. assert(Range.contains( EvaluateConstantChrecAtConstant(this, ConstantInt::get(SE.getContext(), ExitVal - One), SE)->getValue()) && "Linear scev computation is off in a bad way!"); return SE.getConstant(ExitValue); } else if (isQuadratic()) { // If this is a quadratic (3-term) AddRec {L,+,M,+,N}, find the roots of the // quadratic equation to solve it. To do this, we must frame our problem in // terms of figuring out when zero is crossed, instead of when // Range.getUpper() is crossed. SmallVector NewOps(op_begin(), op_end()); NewOps[0] = SE.getNegativeSCEV(SE.getConstant(Range.getUpper())); const SCEV *NewAddRec = SE.getAddRecExpr(NewOps, getLoop(), // getNoWrapFlags(FlagNW) FlagAnyWrap); // Next, solve the constructed addrec if (auto Roots = SolveQuadraticEquation(cast(NewAddRec), SE)) { const SCEVConstant *R1 = Roots->first; const SCEVConstant *R2 = Roots->second; // Pick the smallest positive root value. if (ConstantInt *CB = dyn_cast(ConstantExpr::getICmp( ICmpInst::ICMP_ULT, R1->getValue(), R2->getValue()))) { if (!CB->getZExtValue()) std::swap(R1, R2); // R1 is the minimum root now. // Make sure the root is not off by one. The returned iteration should // not be in the range, but the previous one should be. When solving // for "X*X < 5", for example, we should not return a root of 2. ConstantInt *R1Val = EvaluateConstantChrecAtConstant(this, R1->getValue(), SE); if (Range.contains(R1Val->getValue())) { // The next iteration must be out of the range... ConstantInt *NextVal = ConstantInt::get(SE.getContext(), R1->getAPInt() + 1); R1Val = EvaluateConstantChrecAtConstant(this, NextVal, SE); if (!Range.contains(R1Val->getValue())) return SE.getConstant(NextVal); return SE.getCouldNotCompute(); // Something strange happened } // If R1 was not in the range, then it is a good return value. Make // sure that R1-1 WAS in the range though, just in case. ConstantInt *NextVal = ConstantInt::get(SE.getContext(), R1->getAPInt() - 1); R1Val = EvaluateConstantChrecAtConstant(this, NextVal, SE); if (Range.contains(R1Val->getValue())) return R1; return SE.getCouldNotCompute(); // Something strange happened } } } return SE.getCouldNotCompute(); } namespace { struct FindUndefs { bool Found; FindUndefs() : Found(false) {} bool follow(const SCEV *S) { if (const SCEVUnknown *C = dyn_cast(S)) { if (isa(C->getValue())) Found = true; } else if (const SCEVConstant *C = dyn_cast(S)) { if (isa(C->getValue())) Found = true; } // Keep looking if we haven't found it yet. return !Found; } bool isDone() const { // Stop recursion if we have found an undef. return Found; } }; } // Return true when S contains at least an undef value. static inline bool containsUndefs(const SCEV *S) { FindUndefs F; SCEVTraversal ST(F); ST.visitAll(S); return F.Found; } namespace { // Collect all steps of SCEV expressions. struct SCEVCollectStrides { ScalarEvolution &SE; SmallVectorImpl &Strides; SCEVCollectStrides(ScalarEvolution &SE, SmallVectorImpl &S) : SE(SE), Strides(S) {} bool follow(const SCEV *S) { if (const SCEVAddRecExpr *AR = dyn_cast(S)) Strides.push_back(AR->getStepRecurrence(SE)); return true; } bool isDone() const { return false; } }; // Collect all SCEVUnknown and SCEVMulExpr expressions. struct SCEVCollectTerms { SmallVectorImpl &Terms; SCEVCollectTerms(SmallVectorImpl &T) : Terms(T) {} bool follow(const SCEV *S) { if (isa(S) || isa(S)) { if (!containsUndefs(S)) Terms.push_back(S); // Stop recursion: once we collected a term, do not walk its operands. return false; } // Keep looking. return true; } bool isDone() const { return false; } }; // Check if a SCEV contains an AddRecExpr. struct SCEVHasAddRec { bool &ContainsAddRec; SCEVHasAddRec(bool &ContainsAddRec) : ContainsAddRec(ContainsAddRec) { ContainsAddRec = false; } bool follow(const SCEV *S) { if (isa(S)) { ContainsAddRec = true; // Stop recursion: once we collected a term, do not walk its operands. return false; } // Keep looking. return true; } bool isDone() const { return false; } }; // Find factors that are multiplied with an expression that (possibly as a // subexpression) contains an AddRecExpr. In the expression: // // 8 * (100 + %p * %q * (%a + {0, +, 1}_loop)) // // "%p * %q" are factors multiplied by the expression "(%a + {0, +, 1}_loop)" // that contains the AddRec {0, +, 1}_loop. %p * %q are likely to be array size // parameters as they form a product with an induction variable. // // This collector expects all array size parameters to be in the same MulExpr. // It might be necessary to later add support for collecting parameters that are // spread over different nested MulExpr. struct SCEVCollectAddRecMultiplies { SmallVectorImpl &Terms; ScalarEvolution &SE; SCEVCollectAddRecMultiplies(SmallVectorImpl &T, ScalarEvolution &SE) : Terms(T), SE(SE) {} bool follow(const SCEV *S) { if (auto *Mul = dyn_cast(S)) { bool HasAddRec = false; SmallVector Operands; for (auto Op : Mul->operands()) { if (isa(Op)) { Operands.push_back(Op); } else { bool ContainsAddRec; SCEVHasAddRec ContiansAddRec(ContainsAddRec); visitAll(Op, ContiansAddRec); HasAddRec |= ContainsAddRec; } } if (Operands.size() == 0) return true; if (!HasAddRec) return false; Terms.push_back(SE.getMulExpr(Operands)); // Stop recursion: once we collected a term, do not walk its operands. return false; } // Keep looking. return true; } bool isDone() const { return false; } }; } /// Find parametric terms in this SCEVAddRecExpr. We first for parameters in /// two places: /// 1) The strides of AddRec expressions. /// 2) Unknowns that are multiplied with AddRec expressions. void ScalarEvolution::collectParametricTerms(const SCEV *Expr, SmallVectorImpl &Terms) { SmallVector Strides; SCEVCollectStrides StrideCollector(*this, Strides); visitAll(Expr, StrideCollector); DEBUG({ dbgs() << "Strides:\n"; for (const SCEV *S : Strides) dbgs() << *S << "\n"; }); for (const SCEV *S : Strides) { SCEVCollectTerms TermCollector(Terms); visitAll(S, TermCollector); } DEBUG({ dbgs() << "Terms:\n"; for (const SCEV *T : Terms) dbgs() << *T << "\n"; }); SCEVCollectAddRecMultiplies MulCollector(Terms, *this); visitAll(Expr, MulCollector); } static bool findArrayDimensionsRec(ScalarEvolution &SE, SmallVectorImpl &Terms, SmallVectorImpl &Sizes) { int Last = Terms.size() - 1; const SCEV *Step = Terms[Last]; // End of recursion. if (Last == 0) { if (const SCEVMulExpr *M = dyn_cast(Step)) { SmallVector Qs; for (const SCEV *Op : M->operands()) if (!isa(Op)) Qs.push_back(Op); Step = SE.getMulExpr(Qs); } Sizes.push_back(Step); return true; } for (const SCEV *&Term : Terms) { // Normalize the terms before the next call to findArrayDimensionsRec. const SCEV *Q, *R; SCEVDivision::divide(SE, Term, Step, &Q, &R); // Bail out when GCD does not evenly divide one of the terms. if (!R->isZero()) return false; Term = Q; } // Remove all SCEVConstants. Terms.erase(std::remove_if(Terms.begin(), Terms.end(), [](const SCEV *E) { return isa(E); }), Terms.end()); if (Terms.size() > 0) if (!findArrayDimensionsRec(SE, Terms, Sizes)) return false; Sizes.push_back(Step); return true; } // Returns true when S contains at least a SCEVUnknown parameter. static inline bool containsParameters(const SCEV *S) { struct FindParameter { bool FoundParameter; FindParameter() : FoundParameter(false) {} bool follow(const SCEV *S) { if (isa(S)) { FoundParameter = true; // Stop recursion: we found a parameter. return false; } // Keep looking. return true; } bool isDone() const { // Stop recursion if we have found a parameter. return FoundParameter; } }; FindParameter F; SCEVTraversal ST(F); ST.visitAll(S); return F.FoundParameter; } // Returns true when one of the SCEVs of Terms contains a SCEVUnknown parameter. static inline bool containsParameters(SmallVectorImpl &Terms) { for (const SCEV *T : Terms) if (containsParameters(T)) return true; return false; } // Return the number of product terms in S. static inline int numberOfTerms(const SCEV *S) { if (const SCEVMulExpr *Expr = dyn_cast(S)) return Expr->getNumOperands(); return 1; } static const SCEV *removeConstantFactors(ScalarEvolution &SE, const SCEV *T) { if (isa(T)) return nullptr; if (isa(T)) return T; if (const SCEVMulExpr *M = dyn_cast(T)) { SmallVector Factors; for (const SCEV *Op : M->operands()) if (!isa(Op)) Factors.push_back(Op); return SE.getMulExpr(Factors); } return T; } /// Return the size of an element read or written by Inst. const SCEV *ScalarEvolution::getElementSize(Instruction *Inst) { Type *Ty; if (StoreInst *Store = dyn_cast(Inst)) Ty = Store->getValueOperand()->getType(); else if (LoadInst *Load = dyn_cast(Inst)) Ty = Load->getType(); else return nullptr; Type *ETy = getEffectiveSCEVType(PointerType::getUnqual(Ty)); return getSizeOfExpr(ETy, Ty); } void ScalarEvolution::findArrayDimensions(SmallVectorImpl &Terms, SmallVectorImpl &Sizes, const SCEV *ElementSize) const { if (Terms.size() < 1 || !ElementSize) return; // Early return when Terms do not contain parameters: we do not delinearize // non parametric SCEVs. if (!containsParameters(Terms)) return; DEBUG({ dbgs() << "Terms:\n"; for (const SCEV *T : Terms) dbgs() << *T << "\n"; }); // Remove duplicates. std::sort(Terms.begin(), Terms.end()); Terms.erase(std::unique(Terms.begin(), Terms.end()), Terms.end()); // Put larger terms first. std::sort(Terms.begin(), Terms.end(), [](const SCEV *LHS, const SCEV *RHS) { return numberOfTerms(LHS) > numberOfTerms(RHS); }); ScalarEvolution &SE = *const_cast(this); // Try to divide all terms by the element size. If term is not divisible by // element size, proceed with the original term. for (const SCEV *&Term : Terms) { const SCEV *Q, *R; SCEVDivision::divide(SE, Term, ElementSize, &Q, &R); if (!Q->isZero()) Term = Q; } SmallVector NewTerms; // Remove constant factors. for (const SCEV *T : Terms) if (const SCEV *NewT = removeConstantFactors(SE, T)) NewTerms.push_back(NewT); DEBUG({ dbgs() << "Terms after sorting:\n"; for (const SCEV *T : NewTerms) dbgs() << *T << "\n"; }); if (NewTerms.empty() || !findArrayDimensionsRec(SE, NewTerms, Sizes)) { Sizes.clear(); return; } // The last element to be pushed into Sizes is the size of an element. Sizes.push_back(ElementSize); DEBUG({ dbgs() << "Sizes:\n"; for (const SCEV *S : Sizes) dbgs() << *S << "\n"; }); } void ScalarEvolution::computeAccessFunctions( const SCEV *Expr, SmallVectorImpl &Subscripts, SmallVectorImpl &Sizes) { // Early exit in case this SCEV is not an affine multivariate function. if (Sizes.empty()) return; if (auto *AR = dyn_cast(Expr)) if (!AR->isAffine()) return; const SCEV *Res = Expr; int Last = Sizes.size() - 1; for (int i = Last; i >= 0; i--) { const SCEV *Q, *R; SCEVDivision::divide(*this, Res, Sizes[i], &Q, &R); DEBUG({ dbgs() << "Res: " << *Res << "\n"; dbgs() << "Sizes[i]: " << *Sizes[i] << "\n"; dbgs() << "Res divided by Sizes[i]:\n"; dbgs() << "Quotient: " << *Q << "\n"; dbgs() << "Remainder: " << *R << "\n"; }); Res = Q; // Do not record the last subscript corresponding to the size of elements in // the array. if (i == Last) { // Bail out if the remainder is too complex. if (isa(R)) { Subscripts.clear(); Sizes.clear(); return; } continue; } // Record the access function for the current subscript. Subscripts.push_back(R); } // Also push in last position the remainder of the last division: it will be // the access function of the innermost dimension. Subscripts.push_back(Res); std::reverse(Subscripts.begin(), Subscripts.end()); DEBUG({ dbgs() << "Subscripts:\n"; for (const SCEV *S : Subscripts) dbgs() << *S << "\n"; }); } /// Splits the SCEV into two vectors of SCEVs representing the subscripts and /// sizes of an array access. Returns the remainder of the delinearization that /// is the offset start of the array. The SCEV->delinearize algorithm computes /// the multiples of SCEV coefficients: that is a pattern matching of sub /// expressions in the stride and base of a SCEV corresponding to the /// computation of a GCD (greatest common divisor) of base and stride. When /// SCEV->delinearize fails, it returns the SCEV unchanged. /// /// For example: when analyzing the memory access A[i][j][k] in this loop nest /// /// void foo(long n, long m, long o, double A[n][m][o]) { /// /// for (long i = 0; i < n; i++) /// for (long j = 0; j < m; j++) /// for (long k = 0; k < o; k++) /// A[i][j][k] = 1.0; /// } /// /// the delinearization input is the following AddRec SCEV: /// /// AddRec: {{{%A,+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k> /// /// From this SCEV, we are able to say that the base offset of the access is %A /// because it appears as an offset that does not divide any of the strides in /// the loops: /// /// CHECK: Base offset: %A /// /// and then SCEV->delinearize determines the size of some of the dimensions of /// the array as these are the multiples by which the strides are happening: /// /// CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of sizeof(double) bytes. /// /// Note that the outermost dimension remains of UnknownSize because there are /// no strides that would help identifying the size of the last dimension: when /// the array has been statically allocated, one could compute the size of that /// dimension by dividing the overall size of the array by the size of the known /// dimensions: %m * %o * 8. /// /// Finally delinearize provides the access functions for the array reference /// that does correspond to A[i][j][k] of the above C testcase: /// /// CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>][{0,+,1}<%for.k>] /// /// The testcases are checking the output of a function pass: /// DelinearizationPass that walks through all loads and stores of a function /// asking for the SCEV of the memory access with respect to all enclosing /// loops, calling SCEV->delinearize on that and printing the results. void ScalarEvolution::delinearize(const SCEV *Expr, SmallVectorImpl &Subscripts, SmallVectorImpl &Sizes, const SCEV *ElementSize) { // First step: collect parametric terms. SmallVector Terms; collectParametricTerms(Expr, Terms); if (Terms.empty()) return; // Second step: find subscript sizes. findArrayDimensions(Terms, Sizes, ElementSize); if (Sizes.empty()) return; // Third step: compute the access functions for each subscript. computeAccessFunctions(Expr, Subscripts, Sizes); if (Subscripts.empty()) return; DEBUG({ dbgs() << "succeeded to delinearize " << *Expr << "\n"; dbgs() << "ArrayDecl[UnknownSize]"; for (const SCEV *S : Sizes) dbgs() << "[" << *S << "]"; dbgs() << "\nArrayRef"; for (const SCEV *S : Subscripts) dbgs() << "[" << *S << "]"; dbgs() << "\n"; }); } //===----------------------------------------------------------------------===// // SCEVCallbackVH Class Implementation //===----------------------------------------------------------------------===// void ScalarEvolution::SCEVCallbackVH::deleted() { assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!"); if (PHINode *PN = dyn_cast(getValPtr())) SE->ConstantEvolutionLoopExitValue.erase(PN); SE->eraseValueFromMap(getValPtr()); // this now dangles! } void ScalarEvolution::SCEVCallbackVH::allUsesReplacedWith(Value *V) { assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!"); // Forget all the expressions associated with users of the old value, // so that future queries will recompute the expressions using the new // value. Value *Old = getValPtr(); SmallVector Worklist(Old->user_begin(), Old->user_end()); SmallPtrSet Visited; while (!Worklist.empty()) { User *U = Worklist.pop_back_val(); // Deleting the Old value will cause this to dangle. Postpone // that until everything else is done. if (U == Old) continue; if (!Visited.insert(U).second) continue; if (PHINode *PN = dyn_cast(U)) SE->ConstantEvolutionLoopExitValue.erase(PN); SE->eraseValueFromMap(U); Worklist.insert(Worklist.end(), U->user_begin(), U->user_end()); } // Delete the Old value. if (PHINode *PN = dyn_cast(Old)) SE->ConstantEvolutionLoopExitValue.erase(PN); SE->eraseValueFromMap(Old); // this now dangles! } ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(Value *V, ScalarEvolution *se) : CallbackVH(V), SE(se) {} //===----------------------------------------------------------------------===// // ScalarEvolution Class Implementation //===----------------------------------------------------------------------===// ScalarEvolution::ScalarEvolution(Function &F, TargetLibraryInfo &TLI, AssumptionCache &AC, DominatorTree &DT, LoopInfo &LI) : F(F), TLI(TLI), AC(AC), DT(DT), LI(LI), CouldNotCompute(new SCEVCouldNotCompute()), WalkingBEDominatingConds(false), ProvingSplitPredicate(false), ValuesAtScopes(64), LoopDispositions(64), BlockDispositions(64), FirstUnknown(nullptr) { // To use guards for proving predicates, we need to scan every instruction in // relevant basic blocks, and not just terminators. Doing this is a waste of // time if the IR does not actually contain any calls to // @llvm.experimental.guard, so do a quick check and remember this beforehand. // // This pessimizes the case where a pass that preserves ScalarEvolution wants // to _add_ guards to the module when there weren't any before, and wants // ScalarEvolution to optimize based on those guards. For now we prefer to be // efficient in lieu of being smart in that rather obscure case. auto *GuardDecl = F.getParent()->getFunction( Intrinsic::getName(Intrinsic::experimental_guard)); HasGuards = GuardDecl && !GuardDecl->use_empty(); } ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg) : F(Arg.F), HasGuards(Arg.HasGuards), TLI(Arg.TLI), AC(Arg.AC), DT(Arg.DT), LI(Arg.LI), CouldNotCompute(std::move(Arg.CouldNotCompute)), ValueExprMap(std::move(Arg.ValueExprMap)), WalkingBEDominatingConds(false), ProvingSplitPredicate(false), BackedgeTakenCounts(std::move(Arg.BackedgeTakenCounts)), PredicatedBackedgeTakenCounts( std::move(Arg.PredicatedBackedgeTakenCounts)), ConstantEvolutionLoopExitValue( std::move(Arg.ConstantEvolutionLoopExitValue)), ValuesAtScopes(std::move(Arg.ValuesAtScopes)), LoopDispositions(std::move(Arg.LoopDispositions)), BlockDispositions(std::move(Arg.BlockDispositions)), UnsignedRanges(std::move(Arg.UnsignedRanges)), SignedRanges(std::move(Arg.SignedRanges)), UniqueSCEVs(std::move(Arg.UniqueSCEVs)), UniquePreds(std::move(Arg.UniquePreds)), SCEVAllocator(std::move(Arg.SCEVAllocator)), FirstUnknown(Arg.FirstUnknown) { Arg.FirstUnknown = nullptr; } ScalarEvolution::~ScalarEvolution() { // Iterate through all the SCEVUnknown instances and call their // destructors, so that they release their references to their values. for (SCEVUnknown *U = FirstUnknown; U;) { SCEVUnknown *Tmp = U; U = U->Next; Tmp->~SCEVUnknown(); } FirstUnknown = nullptr; ExprValueMap.clear(); ValueExprMap.clear(); HasRecMap.clear(); // Free any extra memory created for ExitNotTakenInfo in the unlikely event // that a loop had multiple computable exits. for (auto &BTCI : BackedgeTakenCounts) BTCI.second.clear(); for (auto &BTCI : PredicatedBackedgeTakenCounts) BTCI.second.clear(); assert(PendingLoopPredicates.empty() && "isImpliedCond garbage"); assert(!WalkingBEDominatingConds && "isLoopBackedgeGuardedByCond garbage!"); assert(!ProvingSplitPredicate && "ProvingSplitPredicate garbage!"); } bool ScalarEvolution::hasLoopInvariantBackedgeTakenCount(const Loop *L) { return !isa(getBackedgeTakenCount(L)); } static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE, const Loop *L) { // Print all inner loops first for (Loop *I : *L) PrintLoopInfo(OS, SE, I); OS << "Loop "; L->getHeader()->printAsOperand(OS, /*PrintType=*/false); OS << ": "; SmallVector ExitBlocks; L->getExitBlocks(ExitBlocks); if (ExitBlocks.size() != 1) OS << " "; if (SE->hasLoopInvariantBackedgeTakenCount(L)) { OS << "backedge-taken count is " << *SE->getBackedgeTakenCount(L); } else { OS << "Unpredictable backedge-taken count. "; } OS << "\n" "Loop "; L->getHeader()->printAsOperand(OS, /*PrintType=*/false); OS << ": "; if (!isa(SE->getMaxBackedgeTakenCount(L))) { OS << "max backedge-taken count is " << *SE->getMaxBackedgeTakenCount(L); } else { OS << "Unpredictable max backedge-taken count. "; } OS << "\n" "Loop "; L->getHeader()->printAsOperand(OS, /*PrintType=*/false); OS << ": "; SCEVUnionPredicate Pred; auto PBT = SE->getPredicatedBackedgeTakenCount(L, Pred); if (!isa(PBT)) { OS << "Predicated backedge-taken count is " << *PBT << "\n"; OS << " Predicates:\n"; Pred.print(OS, 4); } else { OS << "Unpredictable predicated backedge-taken count. "; } OS << "\n"; } static StringRef loopDispositionToStr(ScalarEvolution::LoopDisposition LD) { switch (LD) { case ScalarEvolution::LoopVariant: return "Variant"; case ScalarEvolution::LoopInvariant: return "Invariant"; case ScalarEvolution::LoopComputable: return "Computable"; } llvm_unreachable("Unknown ScalarEvolution::LoopDisposition kind!"); } void ScalarEvolution::print(raw_ostream &OS) const { // ScalarEvolution's implementation of the print method is to print // out SCEV values of all instructions that are interesting. Doing // this potentially causes it to create new SCEV objects though, // which technically conflicts with the const qualifier. This isn't // observable from outside the class though, so casting away the // const isn't dangerous. ScalarEvolution &SE = *const_cast(this); OS << "Classifying expressions for: "; F.printAsOperand(OS, /*PrintType=*/false); OS << "\n"; for (Instruction &I : instructions(F)) if (isSCEVable(I.getType()) && !isa(I)) { OS << I << '\n'; OS << " --> "; const SCEV *SV = SE.getSCEV(&I); SV->print(OS); if (!isa(SV)) { OS << " U: "; SE.getUnsignedRange(SV).print(OS); OS << " S: "; SE.getSignedRange(SV).print(OS); } const Loop *L = LI.getLoopFor(I.getParent()); const SCEV *AtUse = SE.getSCEVAtScope(SV, L); if (AtUse != SV) { OS << " --> "; AtUse->print(OS); if (!isa(AtUse)) { OS << " U: "; SE.getUnsignedRange(AtUse).print(OS); OS << " S: "; SE.getSignedRange(AtUse).print(OS); } } if (L) { OS << "\t\t" "Exits: "; const SCEV *ExitValue = SE.getSCEVAtScope(SV, L->getParentLoop()); if (!SE.isLoopInvariant(ExitValue, L)) { OS << "<>"; } else { OS << *ExitValue; } bool First = true; for (auto *Iter = L; Iter; Iter = Iter->getParentLoop()) { if (First) { OS << "\t\t" "LoopDispositions: { "; First = false; } else { OS << ", "; } Iter->getHeader()->printAsOperand(OS, /*PrintType=*/false); OS << ": " << loopDispositionToStr(SE.getLoopDisposition(SV, Iter)); } for (auto *InnerL : depth_first(L)) { if (InnerL == L) continue; if (First) { OS << "\t\t" "LoopDispositions: { "; First = false; } else { OS << ", "; } InnerL->getHeader()->printAsOperand(OS, /*PrintType=*/false); OS << ": " << loopDispositionToStr(SE.getLoopDisposition(SV, InnerL)); } OS << " }"; } OS << "\n"; } OS << "Determining loop execution counts for: "; F.printAsOperand(OS, /*PrintType=*/false); OS << "\n"; for (Loop *I : LI) PrintLoopInfo(OS, &SE, I); } ScalarEvolution::LoopDisposition ScalarEvolution::getLoopDisposition(const SCEV *S, const Loop *L) { auto &Values = LoopDispositions[S]; for (auto &V : Values) { if (V.getPointer() == L) return V.getInt(); } Values.emplace_back(L, LoopVariant); LoopDisposition D = computeLoopDisposition(S, L); auto &Values2 = LoopDispositions[S]; for (auto &V : make_range(Values2.rbegin(), Values2.rend())) { if (V.getPointer() == L) { V.setInt(D); break; } } return D; } ScalarEvolution::LoopDisposition ScalarEvolution::computeLoopDisposition(const SCEV *S, const Loop *L) { switch (static_cast(S->getSCEVType())) { case scConstant: return LoopInvariant; case scTruncate: case scZeroExtend: case scSignExtend: return getLoopDisposition(cast(S)->getOperand(), L); case scAddRecExpr: { const SCEVAddRecExpr *AR = cast(S); // If L is the addrec's loop, it's computable. if (AR->getLoop() == L) return LoopComputable; // Add recurrences are never invariant in the function-body (null loop). if (!L) return LoopVariant; // This recurrence is variant w.r.t. L if L contains AR's loop. if (L->contains(AR->getLoop())) return LoopVariant; // This recurrence is invariant w.r.t. L if AR's loop contains L. if (AR->getLoop()->contains(L)) return LoopInvariant; // This recurrence is variant w.r.t. L if any of its operands // are variant. for (auto *Op : AR->operands()) if (!isLoopInvariant(Op, L)) return LoopVariant; // Otherwise it's loop-invariant. return LoopInvariant; } case scAddExpr: case scMulExpr: case scUMaxExpr: case scSMaxExpr: { bool HasVarying = false; for (auto *Op : cast(S)->operands()) { LoopDisposition D = getLoopDisposition(Op, L); if (D == LoopVariant) return LoopVariant; if (D == LoopComputable) HasVarying = true; } return HasVarying ? LoopComputable : LoopInvariant; } case scUDivExpr: { const SCEVUDivExpr *UDiv = cast(S); LoopDisposition LD = getLoopDisposition(UDiv->getLHS(), L); if (LD == LoopVariant) return LoopVariant; LoopDisposition RD = getLoopDisposition(UDiv->getRHS(), L); if (RD == LoopVariant) return LoopVariant; return (LD == LoopInvariant && RD == LoopInvariant) ? LoopInvariant : LoopComputable; } case scUnknown: // All non-instruction values are loop invariant. All instructions are loop // invariant if they are not contained in the specified loop. // Instructions are never considered invariant in the function body // (null loop) because they are defined within the "loop". if (auto *I = dyn_cast(cast(S)->getValue())) return (L && !L->contains(I)) ? LoopInvariant : LoopVariant; return LoopInvariant; case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); } llvm_unreachable("Unknown SCEV kind!"); } bool ScalarEvolution::isLoopInvariant(const SCEV *S, const Loop *L) { return getLoopDisposition(S, L) == LoopInvariant; } bool ScalarEvolution::hasComputableLoopEvolution(const SCEV *S, const Loop *L) { return getLoopDisposition(S, L) == LoopComputable; } ScalarEvolution::BlockDisposition ScalarEvolution::getBlockDisposition(const SCEV *S, const BasicBlock *BB) { auto &Values = BlockDispositions[S]; for (auto &V : Values) { if (V.getPointer() == BB) return V.getInt(); } Values.emplace_back(BB, DoesNotDominateBlock); BlockDisposition D = computeBlockDisposition(S, BB); auto &Values2 = BlockDispositions[S]; for (auto &V : make_range(Values2.rbegin(), Values2.rend())) { if (V.getPointer() == BB) { V.setInt(D); break; } } return D; } ScalarEvolution::BlockDisposition ScalarEvolution::computeBlockDisposition(const SCEV *S, const BasicBlock *BB) { switch (static_cast(S->getSCEVType())) { case scConstant: return ProperlyDominatesBlock; case scTruncate: case scZeroExtend: case scSignExtend: return getBlockDisposition(cast(S)->getOperand(), BB); case scAddRecExpr: { // This uses a "dominates" query instead of "properly dominates" query // to test for proper dominance too, because the instruction which // produces the addrec's value is a PHI, and a PHI effectively properly // dominates its entire containing block. const SCEVAddRecExpr *AR = cast(S); if (!DT.dominates(AR->getLoop()->getHeader(), BB)) return DoesNotDominateBlock; } // FALL THROUGH into SCEVNAryExpr handling. case scAddExpr: case scMulExpr: case scUMaxExpr: case scSMaxExpr: { const SCEVNAryExpr *NAry = cast(S); bool Proper = true; for (const SCEV *NAryOp : NAry->operands()) { BlockDisposition D = getBlockDisposition(NAryOp, BB); if (D == DoesNotDominateBlock) return DoesNotDominateBlock; if (D == DominatesBlock) Proper = false; } return Proper ? ProperlyDominatesBlock : DominatesBlock; } case scUDivExpr: { const SCEVUDivExpr *UDiv = cast(S); const SCEV *LHS = UDiv->getLHS(), *RHS = UDiv->getRHS(); BlockDisposition LD = getBlockDisposition(LHS, BB); if (LD == DoesNotDominateBlock) return DoesNotDominateBlock; BlockDisposition RD = getBlockDisposition(RHS, BB); if (RD == DoesNotDominateBlock) return DoesNotDominateBlock; return (LD == ProperlyDominatesBlock && RD == ProperlyDominatesBlock) ? ProperlyDominatesBlock : DominatesBlock; } case scUnknown: if (Instruction *I = dyn_cast(cast(S)->getValue())) { if (I->getParent() == BB) return DominatesBlock; if (DT.properlyDominates(I->getParent(), BB)) return ProperlyDominatesBlock; return DoesNotDominateBlock; } return ProperlyDominatesBlock; case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); } llvm_unreachable("Unknown SCEV kind!"); } bool ScalarEvolution::dominates(const SCEV *S, const BasicBlock *BB) { return getBlockDisposition(S, BB) >= DominatesBlock; } bool ScalarEvolution::properlyDominates(const SCEV *S, const BasicBlock *BB) { return getBlockDisposition(S, BB) == ProperlyDominatesBlock; } bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const { // Search for a SCEV expression node within an expression tree. // Implements SCEVTraversal::Visitor. struct SCEVSearch { const SCEV *Node; bool IsFound; SCEVSearch(const SCEV *N): Node(N), IsFound(false) {} bool follow(const SCEV *S) { IsFound |= (S == Node); return !IsFound; } bool isDone() const { return IsFound; } }; SCEVSearch Search(Op); visitAll(S, Search); return Search.IsFound; } void ScalarEvolution::forgetMemoizedResults(const SCEV *S) { ValuesAtScopes.erase(S); LoopDispositions.erase(S); BlockDispositions.erase(S); UnsignedRanges.erase(S); SignedRanges.erase(S); ExprValueMap.erase(S); HasRecMap.erase(S); auto RemoveSCEVFromBackedgeMap = [S, this](DenseMap &Map) { for (auto I = Map.begin(), E = Map.end(); I != E;) { BackedgeTakenInfo &BEInfo = I->second; if (BEInfo.hasOperand(S, this)) { BEInfo.clear(); Map.erase(I++); } else ++I; } }; RemoveSCEVFromBackedgeMap(BackedgeTakenCounts); RemoveSCEVFromBackedgeMap(PredicatedBackedgeTakenCounts); } typedef DenseMap VerifyMap; /// replaceSubString - Replaces all occurrences of From in Str with To. static void replaceSubString(std::string &Str, StringRef From, StringRef To) { size_t Pos = 0; while ((Pos = Str.find(From, Pos)) != std::string::npos) { Str.replace(Pos, From.size(), To.data(), To.size()); Pos += To.size(); } } /// getLoopBackedgeTakenCounts - Helper method for verifyAnalysis. static void getLoopBackedgeTakenCounts(Loop *L, VerifyMap &Map, ScalarEvolution &SE) { std::string &S = Map[L]; if (S.empty()) { raw_string_ostream OS(S); SE.getBackedgeTakenCount(L)->print(OS); // false and 0 are semantically equivalent. This can happen in dead loops. replaceSubString(OS.str(), "false", "0"); // Remove wrap flags, their use in SCEV is highly fragile. // FIXME: Remove this when SCEV gets smarter about them. replaceSubString(OS.str(), "", ""); replaceSubString(OS.str(), "", ""); replaceSubString(OS.str(), "", ""); } for (auto *R : reverse(*L)) getLoopBackedgeTakenCounts(R, Map, SE); // recurse. } void ScalarEvolution::verify() const { ScalarEvolution &SE = *const_cast(this); // Gather stringified backedge taken counts for all loops using SCEV's caches. // FIXME: It would be much better to store actual values instead of strings, // but SCEV pointers will change if we drop the caches. VerifyMap BackedgeDumpsOld, BackedgeDumpsNew; for (LoopInfo::reverse_iterator I = LI.rbegin(), E = LI.rend(); I != E; ++I) getLoopBackedgeTakenCounts(*I, BackedgeDumpsOld, SE); // Gather stringified backedge taken counts for all loops using a fresh // ScalarEvolution object. ScalarEvolution SE2(F, TLI, AC, DT, LI); for (LoopInfo::reverse_iterator I = LI.rbegin(), E = LI.rend(); I != E; ++I) getLoopBackedgeTakenCounts(*I, BackedgeDumpsNew, SE2); // Now compare whether they're the same with and without caches. This allows // verifying that no pass changed the cache. assert(BackedgeDumpsOld.size() == BackedgeDumpsNew.size() && "New loops suddenly appeared!"); for (VerifyMap::iterator OldI = BackedgeDumpsOld.begin(), OldE = BackedgeDumpsOld.end(), NewI = BackedgeDumpsNew.begin(); OldI != OldE; ++OldI, ++NewI) { assert(OldI->first == NewI->first && "Loop order changed!"); // Compare the stringified SCEVs. We don't care if undef backedgetaken count // changes. // FIXME: We currently ignore SCEV changes from/to CouldNotCompute. This // means that a pass is buggy or SCEV has to learn a new pattern but is // usually not harmful. if (OldI->second != NewI->second && OldI->second.find("undef") == std::string::npos && NewI->second.find("undef") == std::string::npos && OldI->second != "***COULDNOTCOMPUTE***" && NewI->second != "***COULDNOTCOMPUTE***") { dbgs() << "SCEVValidator: SCEV for loop '" << OldI->first->getHeader()->getName() << "' changed from '" << OldI->second << "' to '" << NewI->second << "'!\n"; std::abort(); } } // TODO: Verify more things. } char ScalarEvolutionAnalysis::PassID; ScalarEvolution ScalarEvolutionAnalysis::run(Function &F, AnalysisManager &AM) { return ScalarEvolution(F, AM.getResult(F), AM.getResult(F), AM.getResult(F), AM.getResult(F)); } PreservedAnalyses ScalarEvolutionPrinterPass::run(Function &F, AnalysisManager &AM) { AM.getResult(F).print(OS); return PreservedAnalyses::all(); } INITIALIZE_PASS_BEGIN(ScalarEvolutionWrapperPass, "scalar-evolution", "Scalar Evolution Analysis", false, true) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(ScalarEvolutionWrapperPass, "scalar-evolution", "Scalar Evolution Analysis", false, true) char ScalarEvolutionWrapperPass::ID = 0; ScalarEvolutionWrapperPass::ScalarEvolutionWrapperPass() : FunctionPass(ID) { initializeScalarEvolutionWrapperPassPass(*PassRegistry::getPassRegistry()); } bool ScalarEvolutionWrapperPass::runOnFunction(Function &F) { SE.reset(new ScalarEvolution( F, getAnalysis().getTLI(), getAnalysis().getAssumptionCache(F), getAnalysis().getDomTree(), getAnalysis().getLoopInfo())); return false; } void ScalarEvolutionWrapperPass::releaseMemory() { SE.reset(); } void ScalarEvolutionWrapperPass::print(raw_ostream &OS, const Module *) const { SE->print(OS); } void ScalarEvolutionWrapperPass::verifyAnalysis() const { if (!VerifySCEV) return; SE->verify(); } void ScalarEvolutionWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequiredTransitive(); AU.addRequiredTransitive(); AU.addRequiredTransitive(); AU.addRequiredTransitive(); } const SCEVPredicate * ScalarEvolution::getEqualPredicate(const SCEVUnknown *LHS, const SCEVConstant *RHS) { FoldingSetNodeID ID; // Unique this node based on the arguments ID.AddInteger(SCEVPredicate::P_Equal); ID.AddPointer(LHS); ID.AddPointer(RHS); void *IP = nullptr; if (const auto *S = UniquePreds.FindNodeOrInsertPos(ID, IP)) return S; SCEVEqualPredicate *Eq = new (SCEVAllocator) SCEVEqualPredicate(ID.Intern(SCEVAllocator), LHS, RHS); UniquePreds.InsertNode(Eq, IP); return Eq; } const SCEVPredicate *ScalarEvolution::getWrapPredicate( const SCEVAddRecExpr *AR, SCEVWrapPredicate::IncrementWrapFlags AddedFlags) { FoldingSetNodeID ID; // Unique this node based on the arguments ID.AddInteger(SCEVPredicate::P_Wrap); ID.AddPointer(AR); ID.AddInteger(AddedFlags); void *IP = nullptr; if (const auto *S = UniquePreds.FindNodeOrInsertPos(ID, IP)) return S; auto *OF = new (SCEVAllocator) SCEVWrapPredicate(ID.Intern(SCEVAllocator), AR, AddedFlags); UniquePreds.InsertNode(OF, IP); return OF; } namespace { class SCEVPredicateRewriter : public SCEVRewriteVisitor { public: // Rewrites \p S in the context of a loop L and the predicate A. // If Assume is true, rewrite is free to add further predicates to A // such that the result will be an AddRecExpr. static const SCEV *rewrite(const SCEV *S, const Loop *L, ScalarEvolution &SE, SCEVUnionPredicate &A, bool Assume) { SCEVPredicateRewriter Rewriter(L, SE, A, Assume); return Rewriter.visit(S); } SCEVPredicateRewriter(const Loop *L, ScalarEvolution &SE, SCEVUnionPredicate &P, bool Assume) : SCEVRewriteVisitor(SE), P(P), L(L), Assume(Assume) {} const SCEV *visitUnknown(const SCEVUnknown *Expr) { auto ExprPreds = P.getPredicatesForExpr(Expr); for (auto *Pred : ExprPreds) if (const auto *IPred = dyn_cast(Pred)) if (IPred->getLHS() == Expr) return IPred->getRHS(); return Expr; } const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) { const SCEV *Operand = visit(Expr->getOperand()); const SCEVAddRecExpr *AR = dyn_cast(Operand); if (AR && AR->getLoop() == L && AR->isAffine()) { // This couldn't be folded because the operand didn't have the nuw // flag. Add the nusw flag as an assumption that we could make. const SCEV *Step = AR->getStepRecurrence(SE); Type *Ty = Expr->getType(); if (addOverflowAssumption(AR, SCEVWrapPredicate::IncrementNUSW)) return SE.getAddRecExpr(SE.getZeroExtendExpr(AR->getStart(), Ty), SE.getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); } return SE.getZeroExtendExpr(Operand, Expr->getType()); } const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) { const SCEV *Operand = visit(Expr->getOperand()); const SCEVAddRecExpr *AR = dyn_cast(Operand); if (AR && AR->getLoop() == L && AR->isAffine()) { // This couldn't be folded because the operand didn't have the nsw // flag. Add the nssw flag as an assumption that we could make. const SCEV *Step = AR->getStepRecurrence(SE); Type *Ty = Expr->getType(); if (addOverflowAssumption(AR, SCEVWrapPredicate::IncrementNSSW)) return SE.getAddRecExpr(SE.getSignExtendExpr(AR->getStart(), Ty), SE.getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); } return SE.getSignExtendExpr(Operand, Expr->getType()); } private: bool addOverflowAssumption(const SCEVAddRecExpr *AR, SCEVWrapPredicate::IncrementWrapFlags AddedFlags) { auto *A = SE.getWrapPredicate(AR, AddedFlags); if (!Assume) { // Check if we've already made this assumption. if (P.implies(A)) return true; return false; } P.add(A); return true; } SCEVUnionPredicate &P; const Loop *L; bool Assume; }; } // end anonymous namespace const SCEV *ScalarEvolution::rewriteUsingPredicate(const SCEV *S, const Loop *L, SCEVUnionPredicate &Preds) { return SCEVPredicateRewriter::rewrite(S, L, *this, Preds, false); } const SCEVAddRecExpr * ScalarEvolution::convertSCEVToAddRecWithPredicates(const SCEV *S, const Loop *L, SCEVUnionPredicate &Preds) { SCEVUnionPredicate TransformPreds; S = SCEVPredicateRewriter::rewrite(S, L, *this, TransformPreds, true); auto *AddRec = dyn_cast(S); if (!AddRec) return nullptr; // Since the transformation was successful, we can now transfer the SCEV // predicates. Preds.add(&TransformPreds); return AddRec; } /// SCEV predicates SCEVPredicate::SCEVPredicate(const FoldingSetNodeIDRef ID, SCEVPredicateKind Kind) : FastID(ID), Kind(Kind) {} SCEVEqualPredicate::SCEVEqualPredicate(const FoldingSetNodeIDRef ID, const SCEVUnknown *LHS, const SCEVConstant *RHS) : SCEVPredicate(ID, P_Equal), LHS(LHS), RHS(RHS) {} bool SCEVEqualPredicate::implies(const SCEVPredicate *N) const { const auto *Op = dyn_cast(N); if (!Op) return false; return Op->LHS == LHS && Op->RHS == RHS; } bool SCEVEqualPredicate::isAlwaysTrue() const { return false; } const SCEV *SCEVEqualPredicate::getExpr() const { return LHS; } void SCEVEqualPredicate::print(raw_ostream &OS, unsigned Depth) const { OS.indent(Depth) << "Equal predicate: " << *LHS << " == " << *RHS << "\n"; } SCEVWrapPredicate::SCEVWrapPredicate(const FoldingSetNodeIDRef ID, const SCEVAddRecExpr *AR, IncrementWrapFlags Flags) : SCEVPredicate(ID, P_Wrap), AR(AR), Flags(Flags) {} const SCEV *SCEVWrapPredicate::getExpr() const { return AR; } bool SCEVWrapPredicate::implies(const SCEVPredicate *N) const { const auto *Op = dyn_cast(N); return Op && Op->AR == AR && setFlags(Flags, Op->Flags) == Flags; } bool SCEVWrapPredicate::isAlwaysTrue() const { SCEV::NoWrapFlags ScevFlags = AR->getNoWrapFlags(); IncrementWrapFlags IFlags = Flags; if (ScalarEvolution::setFlags(ScevFlags, SCEV::FlagNSW) == ScevFlags) IFlags = clearFlags(IFlags, IncrementNSSW); return IFlags == IncrementAnyWrap; } void SCEVWrapPredicate::print(raw_ostream &OS, unsigned Depth) const { OS.indent(Depth) << *getExpr() << " Added Flags: "; if (SCEVWrapPredicate::IncrementNUSW & getFlags()) OS << ""; if (SCEVWrapPredicate::IncrementNSSW & getFlags()) OS << ""; OS << "\n"; } SCEVWrapPredicate::IncrementWrapFlags SCEVWrapPredicate::getImpliedFlags(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { IncrementWrapFlags ImpliedFlags = IncrementAnyWrap; SCEV::NoWrapFlags StaticFlags = AR->getNoWrapFlags(); // We can safely transfer the NSW flag as NSSW. if (ScalarEvolution::setFlags(StaticFlags, SCEV::FlagNSW) == StaticFlags) ImpliedFlags = IncrementNSSW; if (ScalarEvolution::setFlags(StaticFlags, SCEV::FlagNUW) == StaticFlags) { // If the increment is positive, the SCEV NUW flag will also imply the // WrapPredicate NUSW flag. if (const auto *Step = dyn_cast(AR->getStepRecurrence(SE))) if (Step->getValue()->getValue().isNonNegative()) ImpliedFlags = setFlags(ImpliedFlags, IncrementNUSW); } return ImpliedFlags; } /// Union predicates don't get cached so create a dummy set ID for it. SCEVUnionPredicate::SCEVUnionPredicate() : SCEVPredicate(FoldingSetNodeIDRef(nullptr, 0), P_Union) {} bool SCEVUnionPredicate::isAlwaysTrue() const { return all_of(Preds, [](const SCEVPredicate *I) { return I->isAlwaysTrue(); }); } ArrayRef SCEVUnionPredicate::getPredicatesForExpr(const SCEV *Expr) { auto I = SCEVToPreds.find(Expr); if (I == SCEVToPreds.end()) return ArrayRef(); return I->second; } bool SCEVUnionPredicate::implies(const SCEVPredicate *N) const { if (const auto *Set = dyn_cast(N)) return all_of(Set->Preds, [this](const SCEVPredicate *I) { return this->implies(I); }); auto ScevPredsIt = SCEVToPreds.find(N->getExpr()); if (ScevPredsIt == SCEVToPreds.end()) return false; auto &SCEVPreds = ScevPredsIt->second; return any_of(SCEVPreds, [N](const SCEVPredicate *I) { return I->implies(N); }); } const SCEV *SCEVUnionPredicate::getExpr() const { return nullptr; } void SCEVUnionPredicate::print(raw_ostream &OS, unsigned Depth) const { for (auto Pred : Preds) Pred->print(OS, Depth); } void SCEVUnionPredicate::add(const SCEVPredicate *N) { if (const auto *Set = dyn_cast(N)) { for (auto Pred : Set->Preds) add(Pred); return; } if (implies(N)) return; const SCEV *Key = N->getExpr(); assert(Key && "Only SCEVUnionPredicate doesn't have an " " associated expression!"); SCEVToPreds[Key].push_back(N); Preds.push_back(N); } PredicatedScalarEvolution::PredicatedScalarEvolution(ScalarEvolution &SE, Loop &L) : SE(SE), L(L), Generation(0), BackedgeCount(nullptr) {} const SCEV *PredicatedScalarEvolution::getSCEV(Value *V) { const SCEV *Expr = SE.getSCEV(V); RewriteEntry &Entry = RewriteMap[Expr]; // If we already have an entry and the version matches, return it. if (Entry.second && Generation == Entry.first) return Entry.second; // We found an entry but it's stale. Rewrite the stale entry // acording to the current predicate. if (Entry.second) Expr = Entry.second; const SCEV *NewSCEV = SE.rewriteUsingPredicate(Expr, &L, Preds); Entry = {Generation, NewSCEV}; return NewSCEV; } const SCEV *PredicatedScalarEvolution::getBackedgeTakenCount() { if (!BackedgeCount) { SCEVUnionPredicate BackedgePred; BackedgeCount = SE.getPredicatedBackedgeTakenCount(&L, BackedgePred); addPredicate(BackedgePred); } return BackedgeCount; } void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) { if (Preds.implies(&Pred)) return; Preds.add(&Pred); updateGeneration(); } const SCEVUnionPredicate &PredicatedScalarEvolution::getUnionPredicate() const { return Preds; } void PredicatedScalarEvolution::updateGeneration() { // If the generation number wrapped recompute everything. if (++Generation == 0) { for (auto &II : RewriteMap) { const SCEV *Rewritten = II.second.second; II.second = {Generation, SE.rewriteUsingPredicate(Rewritten, &L, Preds)}; } } } void PredicatedScalarEvolution::setNoOverflow( Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags) { const SCEV *Expr = getSCEV(V); const auto *AR = cast(Expr); auto ImpliedFlags = SCEVWrapPredicate::getImpliedFlags(AR, SE); // Clear the statically implied flags. Flags = SCEVWrapPredicate::clearFlags(Flags, ImpliedFlags); addPredicate(*SE.getWrapPredicate(AR, Flags)); auto II = FlagsMap.insert({V, Flags}); if (!II.second) II.first->second = SCEVWrapPredicate::setFlags(Flags, II.first->second); } bool PredicatedScalarEvolution::hasNoOverflow( Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags) { const SCEV *Expr = getSCEV(V); const auto *AR = cast(Expr); Flags = SCEVWrapPredicate::clearFlags( Flags, SCEVWrapPredicate::getImpliedFlags(AR, SE)); auto II = FlagsMap.find(V); if (II != FlagsMap.end()) Flags = SCEVWrapPredicate::clearFlags(Flags, II->second); return Flags == SCEVWrapPredicate::IncrementAnyWrap; } const SCEVAddRecExpr *PredicatedScalarEvolution::getAsAddRec(Value *V) { const SCEV *Expr = this->getSCEV(V); auto *New = SE.convertSCEVToAddRecWithPredicates(Expr, &L, Preds); if (!New) return nullptr; updateGeneration(); RewriteMap[SE.getSCEV(V)] = {Generation, New}; return New; } PredicatedScalarEvolution::PredicatedScalarEvolution( const PredicatedScalarEvolution &Init) : RewriteMap(Init.RewriteMap), SE(Init.SE), L(Init.L), Preds(Init.Preds), Generation(Init.Generation), BackedgeCount(Init.BackedgeCount) { for (const auto &I : Init.FlagsMap) FlagsMap.insert(I); } void PredicatedScalarEvolution::print(raw_ostream &OS, unsigned Depth) const { // For each block. for (auto *BB : L.getBlocks()) for (auto &I : *BB) { if (!SE.isSCEVable(I.getType())) continue; auto *Expr = SE.getSCEV(&I); auto II = RewriteMap.find(Expr); if (II == RewriteMap.end()) continue; // Don't print things that are not interesting. if (II->second.second == Expr) continue; OS.indent(Depth) << "[PSE]" << I << ":\n"; OS.indent(Depth + 2) << *Expr << "\n"; OS.indent(Depth + 2) << "--> " << *II->second.second << "\n"; } } Index: projects/clang390-import/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp =================================================================== --- projects/clang390-import/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp (revision 304769) +++ projects/clang390-import/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp (revision 304770) @@ -1,1950 +1,1953 @@ //=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file contains a pass that performs load / store related peephole // optimizations. This pass should be run after register allocation. // //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; #define DEBUG_TYPE "aarch64-ldst-opt" STATISTIC(NumPairCreated, "Number of load/store pair instructions generated"); STATISTIC(NumPostFolded, "Number of post-index updates folded"); STATISTIC(NumPreFolded, "Number of pre-index updates folded"); STATISTIC(NumUnscaledPairCreated, "Number of load/store from unscaled generated"); STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted"); STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); // The LdStLimit limits how far we search for load/store pairs. static cl::opt LdStLimit("aarch64-load-store-scan-limit", cl::init(20), cl::Hidden); // The UpdateLimit limits how far we search for update instructions when we form // pre-/post-index instructions. static cl::opt UpdateLimit("aarch64-update-scan-limit", cl::init(100), cl::Hidden); static cl::opt EnableNarrowLdMerge("enable-narrow-ld-merge", cl::Hidden, cl::init(false), cl::desc("Enable narrow load merge")); namespace llvm { void initializeAArch64LoadStoreOptPass(PassRegistry &); } #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass" namespace { typedef struct LdStPairFlags { // If a matching instruction is found, MergeForward is set to true if the // merge is to remove the first instruction and replace the second with // a pair-wise insn, and false if the reverse is true. bool MergeForward; // SExtIdx gives the index of the result of the load pair that must be // extended. The value of SExtIdx assumes that the paired load produces the // value in this order: (I, returned iterator), i.e., -1 means no value has // to be extended, 0 means I, and 1 means the returned iterator. int SExtIdx; LdStPairFlags() : MergeForward(false), SExtIdx(-1) {} void setMergeForward(bool V = true) { MergeForward = V; } bool getMergeForward() const { return MergeForward; } void setSExtIdx(int V) { SExtIdx = V; } int getSExtIdx() const { return SExtIdx; } } LdStPairFlags; struct AArch64LoadStoreOpt : public MachineFunctionPass { static char ID; AArch64LoadStoreOpt() : MachineFunctionPass(ID) { initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry()); } const AArch64InstrInfo *TII; const TargetRegisterInfo *TRI; const AArch64Subtarget *Subtarget; // Track which registers have been modified and used. BitVector ModifiedRegs, UsedRegs; // Scan the instructions looking for a load/store that can be combined // with the current instruction into a load/store pair. // Return the matching instruction if one is found, else MBB->end(). MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, LdStPairFlags &Flags, unsigned Limit, bool FindNarrowMerge); // Scan the instructions looking for a store that writes to the address from // which the current load instruction reads. Return true if one is found. bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit, MachineBasicBlock::iterator &StoreI); // Merge the two instructions indicated into a wider instruction. MachineBasicBlock::iterator mergeNarrowInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator MergeMI, const LdStPairFlags &Flags); // Merge the two instructions indicated into a single pair-wise instruction. MachineBasicBlock::iterator mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, const LdStPairFlags &Flags); // Promote the load that reads directly from the address stored to. MachineBasicBlock::iterator promoteLoadFromStore(MachineBasicBlock::iterator LoadI, MachineBasicBlock::iterator StoreI); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using // pre or post indexed addressing with writeback. Scan forwards. MachineBasicBlock::iterator findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using // pre or post indexed addressing with writeback. Scan backwards. MachineBasicBlock::iterator findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); // Find an instruction that updates the base register of the ld/st // instruction. bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, unsigned BaseReg, int Offset); // Merge a pre- or post-index base register update into a ld/st instruction. MachineBasicBlock::iterator mergeUpdateInsn(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update, bool IsPreIdx); // Find and merge foldable ldr/str instructions. bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI); // Find and pair ldr/str instructions. bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI); // Find and promote load instructions which read directly from store. bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI); bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt); bool runOnMachineFunction(MachineFunction &Fn) override; MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::AllVRegsAllocated); } const char *getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; } }; char AArch64LoadStoreOpt::ID = 0; } // namespace INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt", AARCH64_LOAD_STORE_OPT_NAME, false, false) static unsigned getBitExtrOpcode(MachineInstr &MI) { switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode."); case AArch64::LDRBBui: case AArch64::LDURBBi: case AArch64::LDRHHui: case AArch64::LDURHHi: return AArch64::UBFMWri; case AArch64::LDRSBWui: case AArch64::LDURSBWi: case AArch64::LDRSHWui: case AArch64::LDURSHWi: return AArch64::SBFMWri; } } static bool isNarrowStore(unsigned Opc) { switch (Opc) { default: return false; case AArch64::STRBBui: case AArch64::STURBBi: case AArch64::STRHHui: case AArch64::STURHHi: return true; } } static bool isNarrowLoad(unsigned Opc) { switch (Opc) { default: return false; case AArch64::LDRHHui: case AArch64::LDURHHi: case AArch64::LDRBBui: case AArch64::LDURBBi: case AArch64::LDRSHWui: case AArch64::LDURSHWi: case AArch64::LDRSBWui: case AArch64::LDURSBWi: return true; } } static bool isNarrowLoad(MachineInstr &MI) { return isNarrowLoad(MI.getOpcode()); } static bool isNarrowLoadOrStore(unsigned Opc) { return isNarrowLoad(Opc) || isNarrowStore(Opc); } // Scaling factor for unscaled load or store. static int getMemScale(MachineInstr &MI) { switch (MI.getOpcode()) { default: llvm_unreachable("Opcode has unknown scale!"); case AArch64::LDRBBui: case AArch64::LDURBBi: case AArch64::LDRSBWui: case AArch64::LDURSBWi: case AArch64::STRBBui: case AArch64::STURBBi: return 1; case AArch64::LDRHHui: case AArch64::LDURHHi: case AArch64::LDRSHWui: case AArch64::LDURSHWi: case AArch64::STRHHui: case AArch64::STURHHi: return 2; case AArch64::LDRSui: case AArch64::LDURSi: case AArch64::LDRSWui: case AArch64::LDURSWi: case AArch64::LDRWui: case AArch64::LDURWi: case AArch64::STRSui: case AArch64::STURSi: case AArch64::STRWui: case AArch64::STURWi: case AArch64::LDPSi: case AArch64::LDPSWi: case AArch64::LDPWi: case AArch64::STPSi: case AArch64::STPWi: return 4; case AArch64::LDRDui: case AArch64::LDURDi: case AArch64::LDRXui: case AArch64::LDURXi: case AArch64::STRDui: case AArch64::STURDi: case AArch64::STRXui: case AArch64::STURXi: case AArch64::LDPDi: case AArch64::LDPXi: case AArch64::STPDi: case AArch64::STPXi: return 8; case AArch64::LDRQui: case AArch64::LDURQi: case AArch64::STRQui: case AArch64::STURQi: case AArch64::LDPQi: case AArch64::STPQi: return 16; } } static unsigned getMatchingNonSExtOpcode(unsigned Opc, bool *IsValidLdStrOpc = nullptr) { if (IsValidLdStrOpc) *IsValidLdStrOpc = true; switch (Opc) { default: if (IsValidLdStrOpc) *IsValidLdStrOpc = false; return UINT_MAX; case AArch64::STRDui: case AArch64::STURDi: case AArch64::STRQui: case AArch64::STURQi: case AArch64::STRBBui: case AArch64::STURBBi: case AArch64::STRHHui: case AArch64::STURHHi: case AArch64::STRWui: case AArch64::STURWi: case AArch64::STRXui: case AArch64::STURXi: case AArch64::LDRDui: case AArch64::LDURDi: case AArch64::LDRQui: case AArch64::LDURQi: case AArch64::LDRWui: case AArch64::LDURWi: case AArch64::LDRXui: case AArch64::LDURXi: case AArch64::STRSui: case AArch64::STURSi: case AArch64::LDRSui: case AArch64::LDURSi: case AArch64::LDRHHui: case AArch64::LDURHHi: case AArch64::LDRBBui: case AArch64::LDURBBi: return Opc; case AArch64::LDRSWui: return AArch64::LDRWui; case AArch64::LDURSWi: return AArch64::LDURWi; case AArch64::LDRSBWui: return AArch64::LDRBBui; case AArch64::LDRSHWui: return AArch64::LDRHHui; case AArch64::LDURSBWi: return AArch64::LDURBBi; case AArch64::LDURSHWi: return AArch64::LDURHHi; } } static unsigned getMatchingWideOpcode(unsigned Opc) { switch (Opc) { default: llvm_unreachable("Opcode has no wide equivalent!"); case AArch64::STRBBui: return AArch64::STRHHui; case AArch64::STRHHui: return AArch64::STRWui; case AArch64::STURBBi: return AArch64::STURHHi; case AArch64::STURHHi: return AArch64::STURWi; case AArch64::STURWi: return AArch64::STURXi; case AArch64::STRWui: return AArch64::STRXui; case AArch64::LDRHHui: case AArch64::LDRSHWui: return AArch64::LDRWui; case AArch64::LDURHHi: case AArch64::LDURSHWi: return AArch64::LDURWi; case AArch64::LDRBBui: case AArch64::LDRSBWui: return AArch64::LDRHHui; case AArch64::LDURBBi: case AArch64::LDURSBWi: return AArch64::LDURHHi; } } static unsigned getMatchingPairOpcode(unsigned Opc) { switch (Opc) { default: llvm_unreachable("Opcode has no pairwise equivalent!"); case AArch64::STRSui: case AArch64::STURSi: return AArch64::STPSi; case AArch64::STRDui: case AArch64::STURDi: return AArch64::STPDi; case AArch64::STRQui: case AArch64::STURQi: return AArch64::STPQi; case AArch64::STRWui: case AArch64::STURWi: return AArch64::STPWi; case AArch64::STRXui: case AArch64::STURXi: return AArch64::STPXi; case AArch64::LDRSui: case AArch64::LDURSi: return AArch64::LDPSi; case AArch64::LDRDui: case AArch64::LDURDi: return AArch64::LDPDi; case AArch64::LDRQui: case AArch64::LDURQi: return AArch64::LDPQi; case AArch64::LDRWui: case AArch64::LDURWi: return AArch64::LDPWi; case AArch64::LDRXui: case AArch64::LDURXi: return AArch64::LDPXi; case AArch64::LDRSWui: case AArch64::LDURSWi: return AArch64::LDPSWi; } } static unsigned isMatchingStore(MachineInstr &LoadInst, MachineInstr &StoreInst) { unsigned LdOpc = LoadInst.getOpcode(); unsigned StOpc = StoreInst.getOpcode(); switch (LdOpc) { default: llvm_unreachable("Unsupported load instruction!"); case AArch64::LDRBBui: return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; case AArch64::LDURBBi: return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; case AArch64::LDRHHui: return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; case AArch64::LDURHHi: return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; case AArch64::LDRWui: return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; case AArch64::LDURWi: return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; case AArch64::LDRXui: return StOpc == AArch64::STRXui; case AArch64::LDURXi: return StOpc == AArch64::STURXi; } } static unsigned getPreIndexedOpcode(unsigned Opc) { switch (Opc) { default: llvm_unreachable("Opcode has no pre-indexed equivalent!"); case AArch64::STRSui: return AArch64::STRSpre; case AArch64::STRDui: return AArch64::STRDpre; case AArch64::STRQui: return AArch64::STRQpre; case AArch64::STRBBui: return AArch64::STRBBpre; case AArch64::STRHHui: return AArch64::STRHHpre; case AArch64::STRWui: return AArch64::STRWpre; case AArch64::STRXui: return AArch64::STRXpre; case AArch64::LDRSui: return AArch64::LDRSpre; case AArch64::LDRDui: return AArch64::LDRDpre; case AArch64::LDRQui: return AArch64::LDRQpre; case AArch64::LDRBBui: return AArch64::LDRBBpre; case AArch64::LDRHHui: return AArch64::LDRHHpre; case AArch64::LDRWui: return AArch64::LDRWpre; case AArch64::LDRXui: return AArch64::LDRXpre; case AArch64::LDRSWui: return AArch64::LDRSWpre; case AArch64::LDPSi: return AArch64::LDPSpre; case AArch64::LDPSWi: return AArch64::LDPSWpre; case AArch64::LDPDi: return AArch64::LDPDpre; case AArch64::LDPQi: return AArch64::LDPQpre; case AArch64::LDPWi: return AArch64::LDPWpre; case AArch64::LDPXi: return AArch64::LDPXpre; case AArch64::STPSi: return AArch64::STPSpre; case AArch64::STPDi: return AArch64::STPDpre; case AArch64::STPQi: return AArch64::STPQpre; case AArch64::STPWi: return AArch64::STPWpre; case AArch64::STPXi: return AArch64::STPXpre; } } static unsigned getPostIndexedOpcode(unsigned Opc) { switch (Opc) { default: llvm_unreachable("Opcode has no post-indexed wise equivalent!"); case AArch64::STRSui: return AArch64::STRSpost; case AArch64::STRDui: return AArch64::STRDpost; case AArch64::STRQui: return AArch64::STRQpost; case AArch64::STRBBui: return AArch64::STRBBpost; case AArch64::STRHHui: return AArch64::STRHHpost; case AArch64::STRWui: return AArch64::STRWpost; case AArch64::STRXui: return AArch64::STRXpost; case AArch64::LDRSui: return AArch64::LDRSpost; case AArch64::LDRDui: return AArch64::LDRDpost; case AArch64::LDRQui: return AArch64::LDRQpost; case AArch64::LDRBBui: return AArch64::LDRBBpost; case AArch64::LDRHHui: return AArch64::LDRHHpost; case AArch64::LDRWui: return AArch64::LDRWpost; case AArch64::LDRXui: return AArch64::LDRXpost; case AArch64::LDRSWui: return AArch64::LDRSWpost; case AArch64::LDPSi: return AArch64::LDPSpost; case AArch64::LDPSWi: return AArch64::LDPSWpost; case AArch64::LDPDi: return AArch64::LDPDpost; case AArch64::LDPQi: return AArch64::LDPQpost; case AArch64::LDPWi: return AArch64::LDPWpost; case AArch64::LDPXi: return AArch64::LDPXpost; case AArch64::STPSi: return AArch64::STPSpost; case AArch64::STPDi: return AArch64::STPDpost; case AArch64::STPQi: return AArch64::STPQpost; case AArch64::STPWi: return AArch64::STPWpost; case AArch64::STPXi: return AArch64::STPXpost; } } static bool isPairedLdSt(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return false; case AArch64::LDPSi: case AArch64::LDPSWi: case AArch64::LDPDi: case AArch64::LDPQi: case AArch64::LDPWi: case AArch64::LDPXi: case AArch64::STPSi: case AArch64::STPDi: case AArch64::STPQi: case AArch64::STPWi: case AArch64::STPXi: return true; } } static const MachineOperand &getLdStRegOp(const MachineInstr &MI, unsigned PairedRegOp = 0) { assert(PairedRegOp < 2 && "Unexpected register operand idx."); unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0; return MI.getOperand(Idx); } static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) { unsigned Idx = isPairedLdSt(MI) ? 2 : 1; return MI.getOperand(Idx); } static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) { unsigned Idx = isPairedLdSt(MI) ? 3 : 2; return MI.getOperand(Idx); } static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst, MachineInstr &StoreInst, const AArch64InstrInfo *TII) { assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); int LoadSize = getMemScale(LoadInst); int StoreSize = getMemScale(StoreInst); int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst) ? getLdStOffsetOp(StoreInst).getImm() : getLdStOffsetOp(StoreInst).getImm() * StoreSize; int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst) ? getLdStOffsetOp(LoadInst).getImm() : getLdStOffsetOp(LoadInst).getImm() * LoadSize; return (UnscaledStOffset <= UnscaledLdOffset) && (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); } static bool isPromotableZeroStoreOpcode(unsigned Opc) { return isNarrowStore(Opc) || Opc == AArch64::STRWui || Opc == AArch64::STURWi; } static bool isPromotableZeroStoreOpcode(MachineInstr &MI) { return isPromotableZeroStoreOpcode(MI.getOpcode()); } static bool isPromotableZeroStoreInst(MachineInstr &MI) { return (isPromotableZeroStoreOpcode(MI)) && getLdStRegOp(MI).getReg() == AArch64::WZR; } MachineBasicBlock::iterator AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator MergeMI, const LdStPairFlags &Flags) { MachineBasicBlock::iterator NextI = I; ++NextI; // If NextI is the second of the two instructions to be merged, we need // to skip one further. Either way we merge will invalidate the iterator, // and we don't need to scan the new instruction, as it's a pairwise // instruction, which we're not considering for further action anyway. if (NextI == MergeMI) ++NextI; unsigned Opc = I->getOpcode(); bool IsScaled = !TII->isUnscaledLdSt(Opc); int OffsetStride = IsScaled ? 1 : getMemScale(*I); bool MergeForward = Flags.getMergeForward(); // Insert our new paired instruction after whichever of the paired // instructions MergeForward indicates. MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I; // Also based on MergeForward is from where we copy the base register operand // so we get the flags compatible with the input code. const MachineOperand &BaseRegOp = MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I); // Which register is Rt and which is Rt2 depends on the offset order. MachineInstr *RtMI, *Rt2MI; if (getLdStOffsetOp(*I).getImm() == getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) { RtMI = &*MergeMI; Rt2MI = &*I; } else { RtMI = &*I; Rt2MI = &*MergeMI; } int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); // Change the scaled offset from small to large type. if (IsScaled) { assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); OffsetImm /= 2; } DebugLoc DL = I->getDebugLoc(); MachineBasicBlock *MBB = I->getParent(); if (isNarrowLoad(Opc)) { MachineInstr *RtNewDest = &*(MergeForward ? I : MergeMI); // When merging small (< 32 bit) loads for big-endian targets, the order of // the component parts gets swapped. if (!Subtarget->isLittleEndian()) std::swap(RtMI, Rt2MI); // Construct the new load instruction. MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2; NewMemMI = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) .addOperand(getLdStRegOp(*RtNewDest)) .addOperand(BaseRegOp) .addImm(OffsetImm) .setMemRefs(I->mergeMemRefsWith(*MergeMI)); (void)NewMemMI; DEBUG( dbgs() << "Creating the new load and extract. Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); DEBUG(MergeMI->print(dbgs())); DEBUG(dbgs() << " with instructions:\n "); DEBUG((NewMemMI)->print(dbgs())); int Width = getMemScale(*I) == 1 ? 8 : 16; int LSBLow = 0; int LSBHigh = Width; int ImmsLow = LSBLow + Width - 1; int ImmsHigh = LSBHigh + Width - 1; MachineInstr *ExtDestMI = &*(MergeForward ? MergeMI : I); if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) { // Create the bitfield extract for high bits. BitExtMI1 = BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI))) .addOperand(getLdStRegOp(*Rt2MI)) .addReg(getLdStRegOp(*RtNewDest).getReg()) .addImm(LSBHigh) .addImm(ImmsHigh); // Create the bitfield extract for low bits. if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { // For unsigned, prefer to use AND for low bits. BitExtMI2 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri)) .addOperand(getLdStRegOp(*RtMI)) .addReg(getLdStRegOp(*RtNewDest).getReg()) .addImm(ImmsLow); } else { BitExtMI2 = BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI))) .addOperand(getLdStRegOp(*RtMI)) .addReg(getLdStRegOp(*RtNewDest).getReg()) .addImm(LSBLow) .addImm(ImmsLow); } } else { // Create the bitfield extract for low bits. if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { // For unsigned, prefer to use AND for low bits. BitExtMI1 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri)) .addOperand(getLdStRegOp(*RtMI)) .addReg(getLdStRegOp(*RtNewDest).getReg()) .addImm(ImmsLow); } else { BitExtMI1 = BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI))) .addOperand(getLdStRegOp(*RtMI)) .addReg(getLdStRegOp(*RtNewDest).getReg()) .addImm(LSBLow) .addImm(ImmsLow); } // Create the bitfield extract for high bits. BitExtMI2 = BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI))) .addOperand(getLdStRegOp(*Rt2MI)) .addReg(getLdStRegOp(*RtNewDest).getReg()) .addImm(LSBHigh) .addImm(ImmsHigh); } (void)BitExtMI1; (void)BitExtMI2; DEBUG(dbgs() << " "); DEBUG((BitExtMI1)->print(dbgs())); DEBUG(dbgs() << " "); DEBUG((BitExtMI2)->print(dbgs())); DEBUG(dbgs() << "\n"); // Erase the old instructions. I->eraseFromParent(); MergeMI->eraseFromParent(); return NextI; } assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) && "Expected promotable zero store"); // Construct the new instruction. MachineInstrBuilder MIB; MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR) .addOperand(BaseRegOp) .addImm(OffsetImm) .setMemRefs(I->mergeMemRefsWith(*MergeMI)); (void)MIB; DEBUG(dbgs() << "Creating wider load/store. Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); DEBUG(MergeMI->print(dbgs())); DEBUG(dbgs() << " with instruction:\n "); DEBUG(((MachineInstr *)MIB)->print(dbgs())); DEBUG(dbgs() << "\n"); // Erase the old instructions. I->eraseFromParent(); MergeMI->eraseFromParent(); return NextI; } MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, const LdStPairFlags &Flags) { MachineBasicBlock::iterator NextI = I; ++NextI; // If NextI is the second of the two instructions to be merged, we need // to skip one further. Either way we merge will invalidate the iterator, // and we don't need to scan the new instruction, as it's a pairwise // instruction, which we're not considering for further action anyway. if (NextI == Paired) ++NextI; int SExtIdx = Flags.getSExtIdx(); unsigned Opc = SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); bool IsUnscaled = TII->isUnscaledLdSt(Opc); int OffsetStride = IsUnscaled ? getMemScale(*I) : 1; bool MergeForward = Flags.getMergeForward(); // Insert our new paired instruction after whichever of the paired // instructions MergeForward indicates. MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; // Also based on MergeForward is from where we copy the base register operand // so we get the flags compatible with the input code. const MachineOperand &BaseRegOp = MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I); int Offset = getLdStOffsetOp(*I).getImm(); int PairedOffset = getLdStOffsetOp(*Paired).getImm(); bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode()); if (IsUnscaled != PairedIsUnscaled) { // We're trying to pair instructions that differ in how they are scaled. If // I is scaled then scale the offset of Paired accordingly. Otherwise, do // the opposite (i.e., make Paired's offset unscaled). int MemSize = getMemScale(*Paired); if (PairedIsUnscaled) { // If the unscaled offset isn't a multiple of the MemSize, we can't // pair the operations together. assert(!(PairedOffset % getMemScale(*Paired)) && "Offset should be a multiple of the stride!"); PairedOffset /= MemSize; } else { PairedOffset *= MemSize; } } // Which register is Rt and which is Rt2 depends on the offset order. MachineInstr *RtMI, *Rt2MI; if (Offset == PairedOffset + OffsetStride) { RtMI = &*Paired; Rt2MI = &*I; // Here we swapped the assumption made for SExtIdx. // I.e., we turn ldp I, Paired into ldp Paired, I. // Update the index accordingly. if (SExtIdx != -1) SExtIdx = (SExtIdx + 1) % 2; } else { RtMI = &*I; Rt2MI = &*Paired; } int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); // Scale the immediate offset, if necessary. if (TII->isUnscaledLdSt(RtMI->getOpcode())) { assert(!(OffsetImm % getMemScale(*RtMI)) && "Unscaled offset cannot be scaled."); OffsetImm /= getMemScale(*RtMI); } // Construct the new instruction. MachineInstrBuilder MIB; DebugLoc DL = I->getDebugLoc(); MachineBasicBlock *MBB = I->getParent(); MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc))) .addOperand(getLdStRegOp(*RtMI)) .addOperand(getLdStRegOp(*Rt2MI)) .addOperand(BaseRegOp) .addImm(OffsetImm) .setMemRefs(I->mergeMemRefsWith(*Paired)); (void)MIB; DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); DEBUG(Paired->print(dbgs())); DEBUG(dbgs() << " with instruction:\n "); if (SExtIdx != -1) { // Generate the sign extension for the proper result of the ldp. // I.e., with X1, that would be: // %W1 = KILL %W1, %X1 // %X1 = SBFMXri %X1, 0, 31 MachineOperand &DstMO = MIB->getOperand(SExtIdx); // Right now, DstMO has the extended register, since it comes from an // extended opcode. unsigned DstRegX = DstMO.getReg(); // Get the W variant of that register. unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); // Update the result of LDP to use the W instead of the X variant. DstMO.setReg(DstRegW); DEBUG(((MachineInstr *)MIB)->print(dbgs())); DEBUG(dbgs() << "\n"); // Make the machine verifier happy by providing a definition for // the X register. // Insert this definition right after the generated LDP, i.e., before // InsertionPoint. MachineInstrBuilder MIBKill = BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW) .addReg(DstRegW) .addReg(DstRegX, RegState::Define); MIBKill->getOperand(2).setImplicit(); // Create the sign extension. MachineInstrBuilder MIBSXTW = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX) .addReg(DstRegX) .addImm(0) .addImm(31); (void)MIBSXTW; DEBUG(dbgs() << " Extend operand:\n "); DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs())); } else { DEBUG(((MachineInstr *)MIB)->print(dbgs())); } DEBUG(dbgs() << "\n"); // Erase the old instructions. I->eraseFromParent(); Paired->eraseFromParent(); return NextI; } MachineBasicBlock::iterator AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, MachineBasicBlock::iterator StoreI) { MachineBasicBlock::iterator NextI = LoadI; ++NextI; int LoadSize = getMemScale(*LoadI); int StoreSize = getMemScale(*StoreI); unsigned LdRt = getLdStRegOp(*LoadI).getReg(); unsigned StRt = getLdStRegOp(*StoreI).getReg(); bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); assert((IsStoreXReg || TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) && "Unexpected RegClass"); MachineInstr *BitExtMI; if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) { // Remove the load, if the destination register of the loads is the same // register for stored value. if (StRt == LdRt && LoadSize == 8) { DEBUG(dbgs() << "Remove load instruction:\n "); DEBUG(LoadI->print(dbgs())); DEBUG(dbgs() << "\n"); LoadI->eraseFromParent(); return NextI; } // Replace the load with a mov if the load and store are in the same size. BitExtMI = BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt) .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR) .addReg(StRt) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } else { // FIXME: Currently we disable this transformation in big-endian targets as // performance and correctness are verified only in little-endian. if (!Subtarget->isLittleEndian()) return NextI; bool IsUnscaled = TII->isUnscaledLdSt(*LoadI); assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) && "Unsupported ld/st match"); assert(LoadSize <= StoreSize && "Invalid load size"); int UnscaledLdOffset = IsUnscaled ? getLdStOffsetOp(*LoadI).getImm() : getLdStOffsetOp(*LoadI).getImm() * LoadSize; int UnscaledStOffset = IsUnscaled ? getLdStOffsetOp(*StoreI).getImm() : getLdStOffsetOp(*StoreI).getImm() * StoreSize; int Width = LoadSize * 8; int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); int Imms = Immr + Width - 1; unsigned DestReg = IsStoreXReg ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32, &AArch64::GPR64RegClass) : LdRt; assert((UnscaledLdOffset >= UnscaledStOffset && (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) && "Invalid offset"); Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); Imms = Immr + Width - 1; if (UnscaledLdOffset == UnscaledStOffset) { uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N | ((Immr) << 6) // immr | ((Imms) << 0) // imms ; BitExtMI = BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri), DestReg) .addReg(StRt) .addImm(AndMaskEncoded); } else { BitExtMI = BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri), DestReg) .addReg(StRt) .addImm(Immr) .addImm(Imms); } } (void)BitExtMI; DEBUG(dbgs() << "Promoting load by replacing :\n "); DEBUG(StoreI->print(dbgs())); DEBUG(dbgs() << " "); DEBUG(LoadI->print(dbgs())); DEBUG(dbgs() << " with instructions:\n "); DEBUG(StoreI->print(dbgs())); DEBUG(dbgs() << " "); DEBUG((BitExtMI)->print(dbgs())); DEBUG(dbgs() << "\n"); // Erase the old instructions. LoadI->eraseFromParent(); return NextI; } /// trackRegDefsUses - Remember what registers the specified instruction uses /// and modifies. static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs, BitVector &UsedRegs, const TargetRegisterInfo *TRI) { for (const MachineOperand &MO : MI.operands()) { if (MO.isRegMask()) ModifiedRegs.setBitsNotInMask(MO.getRegMask()); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!Reg) continue; if (MO.isDef()) { for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) ModifiedRegs.set(*AI); } else { assert(MO.isUse() && "Reg operand not a def and not a use?!?"); for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) UsedRegs.set(*AI); } } } static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { // Convert the byte-offset used by unscaled into an "element" offset used // by the scaled pair load/store instructions. if (IsUnscaled) { // If the byte-offset isn't a multiple of the stride, there's no point // trying to match it. if (Offset % OffsetStride) return false; Offset /= OffsetStride; } return Offset <= 63 && Offset >= -64; } // Do alignment, specialized to power of 2 and for signed ints, // avoiding having to do a C-style cast from uint_64t to int when // using alignTo from include/llvm/Support/MathExtras.h. // FIXME: Move this function to include/MathExtras.h? static int alignTo(int Num, int PowOf2) { return (Num + PowOf2 - 1) & ~(PowOf2 - 1); } static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb, const AArch64InstrInfo *TII) { // One of the instructions must modify memory. if (!MIa.mayStore() && !MIb.mayStore()) return false; // Both instructions must be memory operations. if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore()) return false; return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb); } static bool mayAlias(MachineInstr &MIa, SmallVectorImpl &MemInsns, const AArch64InstrInfo *TII) { for (MachineInstr *MIb : MemInsns) if (mayAlias(MIa, *MIb, TII)) return true; return false; } bool AArch64LoadStoreOpt::findMatchingStore( MachineBasicBlock::iterator I, unsigned Limit, MachineBasicBlock::iterator &StoreI) { MachineBasicBlock::iterator B = I->getParent()->begin(); MachineBasicBlock::iterator MBBI = I; MachineInstr &LoadMI = *I; unsigned BaseReg = getLdStBaseOp(LoadMI).getReg(); // If the load is the first instruction in the block, there's obviously // not any matching store. if (MBBI == B) return false; // Track which registers have been modified and used between the first insn // and the second insn. ModifiedRegs.reset(); UsedRegs.reset(); unsigned Count = 0; do { --MBBI; MachineInstr &MI = *MBBI; // Don't count DBG_VALUE instructions towards the search limit. if (!MI.isDebugValue()) ++Count; // If the load instruction reads directly from the address to which the // store instruction writes and the stored value is not modified, we can // promote the load. Since we do not handle stores with pre-/post-index, // it's unnecessary to check if BaseReg is modified by the store itself. if (MI.mayStore() && isMatchingStore(LoadMI, MI) && BaseReg == getLdStBaseOp(MI).getReg() && isLdOffsetInRangeOfSt(LoadMI, MI, TII) && !ModifiedRegs[getLdStRegOp(MI).getReg()]) { StoreI = MBBI; return true; } if (MI.isCall()) return false; // Update modified / uses register lists. trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); // Otherwise, if the base register is modified, we have no match, so // return early. if (ModifiedRegs[BaseReg]) return false; // If we encounter a store aliased with the load, return early. if (MI.mayStore() && mayAlias(LoadMI, MI, TII)) return false; } while (MBBI != B && Count < Limit); return false; } // Returns true if FirstMI and MI are candidates for merging or pairing. // Otherwise, returns false. static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI, LdStPairFlags &Flags, const AArch64InstrInfo *TII) { // If this is volatile or if pairing is suppressed, not a candidate. if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) return false; // We should have already checked FirstMI for pair suppression and volatility. assert(!FirstMI.hasOrderedMemoryRef() && !TII->isLdStPairSuppressed(FirstMI) && "FirstMI shouldn't get here if either of these checks are true."); unsigned OpcA = FirstMI.getOpcode(); unsigned OpcB = MI.getOpcode(); // Opcodes match: nothing more to check. if (OpcA == OpcB) return true; // Try to match a sign-extended load/store with a zero-extended load/store. bool IsValidLdStrOpc, PairIsValidLdStrOpc; unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc); assert(IsValidLdStrOpc && "Given Opc should be a Load or Store with an immediate"); // OpcA will be the first instruction in the pair. if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) { Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0); return true; } // If the second instruction isn't even a load/store, bail out. if (!PairIsValidLdStrOpc) return false; // FIXME: We don't support merging narrow loads/stores with mixed // scaled/unscaled offsets. if (isNarrowLoadOrStore(OpcA) || isNarrowLoadOrStore(OpcB)) return false; // Try to match an unscaled load/store with a scaled load/store. return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) && getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB); // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair? } /// Scan the instructions looking for a load/store that can be combined with the /// current instruction into a wider equivalent or a load/store pair. MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, LdStPairFlags &Flags, unsigned Limit, bool FindNarrowMerge) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator MBBI = I; MachineInstr &FirstMI = *I; ++MBBI; bool MayLoad = FirstMI.mayLoad(); bool IsUnscaled = TII->isUnscaledLdSt(FirstMI); unsigned Reg = getLdStRegOp(FirstMI).getReg(); unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); int Offset = getLdStOffsetOp(FirstMI).getImm(); int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1; bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. ModifiedRegs.reset(); UsedRegs.reset(); // Remember any instructions that read/write memory between FirstMI and MI. SmallVector MemInsns; for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { MachineInstr &MI = *MBBI; // Skip DBG_VALUE instructions. Otherwise debug info can affect the // optimization by changing how far we scan. if (MI.isDebugValue()) continue; // Now that we know this is a real instruction, count it. ++Count; Flags.setSExtIdx(-1); if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) && getLdStOffsetOp(MI).isImm()) { assert(MI.mayLoadOrStore() && "Expected memory operation."); // If we've found another instruction with the same opcode, check to see // if the base and offset are compatible with our starting instruction. // These instructions all have scaled immediate operands, so we just // check for +1/-1. Make sure to check the new instruction offset is // actually an immediate and not a symbolic reference destined for // a relocation. unsigned MIBaseReg = getLdStBaseOp(MI).getReg(); int MIOffset = getLdStOffsetOp(MI).getImm(); bool MIIsUnscaled = TII->isUnscaledLdSt(MI); if (IsUnscaled != MIIsUnscaled) { // We're trying to pair instructions that differ in how they are scaled. // If FirstMI is scaled then scale the offset of MI accordingly. // Otherwise, do the opposite (i.e., make MI's offset unscaled). int MemSize = getMemScale(MI); if (MIIsUnscaled) { // If the unscaled offset isn't a multiple of the MemSize, we can't // pair the operations together: bail and keep looking. - if (MIOffset % MemSize) + if (MIOffset % MemSize) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + MemInsns.push_back(&MI); continue; + } MIOffset /= MemSize; } else { MIOffset *= MemSize; } } if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || (Offset + OffsetStride == MIOffset))) { int MinOffset = Offset < MIOffset ? Offset : MIOffset; if (FindNarrowMerge) { // If the alignment requirements of the scaled wide load/store // instruction can't express the offset of the scaled narrow input, // bail and keep looking. For promotable zero stores, allow only when // the stored value is the same (i.e., WZR). if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) || (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); MemInsns.push_back(&MI); continue; } } else { // Pairwise instructions have a 7-bit signed offset field. Single // insns have a 12-bit unsigned offset field. If the resultant // immediate offset of merging these instructions is out of range for // a pairwise instruction, bail and keep looking. if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); MemInsns.push_back(&MI); continue; } // If the alignment requirements of the paired (scaled) instruction // can't express the offset of the unscaled input, bail and keep // looking. if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); MemInsns.push_back(&MI); continue; } } // If the destination register of the loads is the same register, bail // and keep looking. A load-pair instruction with both destination // registers the same is UNPREDICTABLE and will result in an exception. if (MayLoad && Reg == getLdStRegOp(MI).getReg()) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); MemInsns.push_back(&MI); continue; } // If the Rt of the second instruction was not modified or used between // the two instructions and none of the instructions between the second // and first alias with the second, we can combine the second into the // first. if (!ModifiedRegs[getLdStRegOp(MI).getReg()] && !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && !mayAlias(MI, MemInsns, TII)) { Flags.setMergeForward(false); return MBBI; } // Likewise, if the Rt of the first instruction is not modified or used // between the two instructions and none of the instructions between the // first and the second alias with the first, we can combine the first // into the second. if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] && !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) && !mayAlias(FirstMI, MemInsns, TII)) { Flags.setMergeForward(true); return MBBI; } // Unable to combine these instructions due to interference in between. // Keep looking. } } // If the instruction wasn't a matching load or store. Stop searching if we // encounter a call instruction that might modify memory. if (MI.isCall()) return E; // Update modified / uses register lists. trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); // Otherwise, if the base register is modified, we have no match, so // return early. if (ModifiedRegs[BaseReg]) return E; // Update list of instructions that read/write memory. if (MI.mayLoadOrStore()) MemInsns.push_back(&MI); } return E; } MachineBasicBlock::iterator AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update, bool IsPreIdx) { assert((Update->getOpcode() == AArch64::ADDXri || Update->getOpcode() == AArch64::SUBXri) && "Unexpected base register update instruction to merge!"); MachineBasicBlock::iterator NextI = I; // Return the instruction following the merged instruction, which is // the instruction following our unmerged load. Unless that's the add/sub // instruction we're merging, in which case it's the one after that. if (++NextI == Update) ++NextI; int Value = Update->getOperand(2).getImm(); assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && "Can't merge 1 << 12 offset into pre-/post-indexed load / store"); if (Update->getOpcode() == AArch64::SUBXri) Value = -Value; unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) : getPostIndexedOpcode(I->getOpcode()); MachineInstrBuilder MIB; if (!isPairedLdSt(*I)) { // Non-paired instruction. MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) .addOperand(getLdStRegOp(*Update)) .addOperand(getLdStRegOp(*I)) .addOperand(getLdStBaseOp(*I)) .addImm(Value) .setMemRefs(I->memoperands_begin(), I->memoperands_end()); } else { // Paired instruction. int Scale = getMemScale(*I); MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) .addOperand(getLdStRegOp(*Update)) .addOperand(getLdStRegOp(*I, 0)) .addOperand(getLdStRegOp(*I, 1)) .addOperand(getLdStBaseOp(*I)) .addImm(Value / Scale) .setMemRefs(I->memoperands_begin(), I->memoperands_end()); } (void)MIB; if (IsPreIdx) DEBUG(dbgs() << "Creating pre-indexed load/store."); else DEBUG(dbgs() << "Creating post-indexed load/store."); DEBUG(dbgs() << " Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); DEBUG(Update->print(dbgs())); DEBUG(dbgs() << " with instruction:\n "); DEBUG(((MachineInstr *)MIB)->print(dbgs())); DEBUG(dbgs() << "\n"); // Erase the old instructions for the block. I->eraseFromParent(); Update->eraseFromParent(); return NextI; } bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, unsigned BaseReg, int Offset) { switch (MI.getOpcode()) { default: break; case AArch64::SUBXri: - // Negate the offset for a SUB instruction. - Offset *= -1; - // FALLTHROUGH case AArch64::ADDXri: // Make sure it's a vanilla immediate operand, not a relocation or // anything else we can't handle. if (!MI.getOperand(2).isImm()) break; // Watch out for 1 << 12 shifted value. if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm())) break; // The update instruction source and destination register must be the // same as the load/store base register. if (MI.getOperand(0).getReg() != BaseReg || MI.getOperand(1).getReg() != BaseReg) break; bool IsPairedInsn = isPairedLdSt(MemMI); int UpdateOffset = MI.getOperand(2).getImm(); + if (MI.getOpcode() == AArch64::SUBXri) + UpdateOffset = -UpdateOffset; + // For non-paired load/store instructions, the immediate must fit in a // signed 9-bit integer. if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) break; // For paired load/store instructions, the immediate must be a multiple of // the scaling factor. The scaled offset must also fit into a signed 7-bit // integer. if (IsPairedInsn) { int Scale = getMemScale(MemMI); if (UpdateOffset % Scale != 0) break; int ScaledOffset = UpdateOffset / Scale; - if (ScaledOffset > 64 || ScaledOffset < -64) + if (ScaledOffset > 63 || ScaledOffset < -64) break; } // If we have a non-zero Offset, we check that it matches the amount // we're adding to the register. - if (!Offset || Offset == MI.getOperand(2).getImm()) + if (!Offset || Offset == UpdateOffset) return true; break; } return false; } MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI); // Scan forward looking for post-index opportunities. Updating instructions // can't be formed if the memory instruction doesn't have the offset we're // looking for. if (MIUnscaledOffset != UnscaledOffset) return E; // If the base register overlaps a destination register, we can't // merge the update. bool IsPairedInsn = isPairedLdSt(MemMI); for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) return E; } // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. ModifiedRegs.reset(); UsedRegs.reset(); ++MBBI; for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { MachineInstr &MI = *MBBI; // Skip DBG_VALUE instructions. if (MI.isDebugValue()) continue; // Now that we know this is a real instruction, count it. ++Count; // If we found a match, return it. if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset)) return MBBI; // Update the status of what the instruction clobbered and used. trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); // Otherwise, if the base register is used or modified, we have no match, so // return early. if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) return E; } return E; } MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( MachineBasicBlock::iterator I, unsigned Limit) { MachineBasicBlock::iterator B = I->getParent()->begin(); MachineBasicBlock::iterator E = I->getParent()->end(); MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); int Offset = getLdStOffsetOp(MemMI).getImm(); // If the load/store is the first instruction in the block, there's obviously // not any matching update. Ditto if the memory offset isn't zero. if (MBBI == B || Offset != 0) return E; // If the base register overlaps a destination register, we can't // merge the update. bool IsPairedInsn = isPairedLdSt(MemMI); for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) return E; } // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. ModifiedRegs.reset(); UsedRegs.reset(); unsigned Count = 0; do { --MBBI; MachineInstr &MI = *MBBI; // Don't count DBG_VALUE instructions towards the search limit. if (!MI.isDebugValue()) ++Count; // If we found a match, return it. if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) return MBBI; // Update the status of what the instruction clobbered and used. trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); // Otherwise, if the base register is used or modified, we have no match, so // return early. if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) return E; } while (MBBI != B && Count < Limit); return E; } bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; // If this is a volatile load, don't mess with it. if (MI.hasOrderedMemoryRef()) return false; // Make sure this is a reg+imm. // FIXME: It is possible to extend it to handle reg+reg cases. if (!getLdStOffsetOp(MI).isImm()) return false; // Look backward up to LdStLimit instructions. MachineBasicBlock::iterator StoreI; if (findMatchingStore(MBBI, LdStLimit, StoreI)) { ++NumLoadsFromStoresPromoted; // Promote the load. Keeping the iterator straight is a // pain, so we let the merge routine tell us what the next instruction // is after it's done mucking about. MBBI = promoteLoadFromStore(MBBI, StoreI); return true; } return false; } // Find narrow loads that can be converted into a single wider load with // bitfield extract instructions. Also merge adjacent zero stores into a wider // store. bool AArch64LoadStoreOpt::tryToMergeLdStInst( MachineBasicBlock::iterator &MBBI) { assert((isNarrowLoad(*MBBI) || isPromotableZeroStoreOpcode(*MBBI)) && "Expected narrow op."); MachineInstr &MI = *MBBI; MachineBasicBlock::iterator E = MI.getParent()->end(); if (!TII->isCandidateToMergeOrPair(MI)) return false; // For promotable zero stores, the stored value should be WZR. if (isPromotableZeroStoreOpcode(MI) && getLdStRegOp(MI).getReg() != AArch64::WZR) return false; // Look ahead up to LdStLimit instructions for a mergable instruction. LdStPairFlags Flags; MachineBasicBlock::iterator MergeMI = findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true); if (MergeMI != E) { if (isNarrowLoad(MI)) { ++NumNarrowLoadsPromoted; } else if (isPromotableZeroStoreInst(MI)) { ++NumZeroStoresPromoted; } // Keeping the iterator straight is a pain, so we let the merge routine tell // us what the next instruction is after it's done mucking about. MBBI = mergeNarrowInsns(MBBI, MergeMI, Flags); return true; } return false; } // Find loads and stores that can be merged into a single load or store pair // instruction. bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; MachineBasicBlock::iterator E = MI.getParent()->end(); if (!TII->isCandidateToMergeOrPair(MI)) return false; // Early exit if the offset is not possible to match. (6 bits of positive // range, plus allow an extra one in case we find a later insn that matches // with Offset-1) bool IsUnscaled = TII->isUnscaledLdSt(MI); int Offset = getLdStOffsetOp(MI).getImm(); int OffsetStride = IsUnscaled ? getMemScale(MI) : 1; if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) return false; // Look ahead up to LdStLimit instructions for a pairable instruction. LdStPairFlags Flags; MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false); if (Paired != E) { ++NumPairCreated; if (TII->isUnscaledLdSt(MI)) ++NumUnscaledPairCreated; // Keeping the iterator straight is a pain, so we let the merge routine tell // us what the next instruction is after it's done mucking about. MBBI = mergePairedInsns(MBBI, Paired, Flags); return true; } return false; } bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt) { bool Modified = false; // Four tranformations to do here: // 1) Find loads that directly read from stores and promote them by // replacing with mov instructions. If the store is wider than the load, // the load will be replaced with a bitfield extract. // e.g., // str w1, [x0, #4] // ldrh w2, [x0, #6] // ; becomes // str w1, [x0, #4] // lsr w2, w1, #16 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { MachineInstr &MI = *MBBI; switch (MI.getOpcode()) { default: // Just move on to the next instruction. ++MBBI; break; // Scaled instructions. case AArch64::LDRBBui: case AArch64::LDRHHui: case AArch64::LDRWui: case AArch64::LDRXui: // Unscaled instructions. case AArch64::LDURBBi: case AArch64::LDURHHi: case AArch64::LDURWi: case AArch64::LDURXi: { if (tryToPromoteLoadFromStore(MBBI)) { Modified = true; break; } ++MBBI; break; } } } // 2) Find narrow loads that can be converted into a single wider load // with bitfield extract instructions. // e.g., // ldrh w0, [x2] // ldrh w1, [x2, #2] // ; becomes // ldr w0, [x2] // ubfx w1, w0, #16, #16 // and w0, w0, #ffff // // Also merge adjacent zero stores into a wider store. // e.g., // strh wzr, [x0] // strh wzr, [x0, #2] // ; becomes // str wzr, [x0] for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); enableNarrowLdOpt && MBBI != E;) { MachineInstr &MI = *MBBI; unsigned Opc = MI.getOpcode(); if (isPromotableZeroStoreOpcode(Opc) || (EnableNarrowLdMerge && isNarrowLoad(Opc))) { if (tryToMergeLdStInst(MBBI)) { Modified = true; } else ++MBBI; } else ++MBBI; } // 3) Find loads and stores that can be merged into a single load or store // pair instruction. // e.g., // ldr x0, [x2] // ldr x1, [x2, #8] // ; becomes // ldp x0, x1, [x2] for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { MachineInstr &MI = *MBBI; switch (MI.getOpcode()) { default: // Just move on to the next instruction. ++MBBI; break; // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: case AArch64::STRXui: case AArch64::STRWui: case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: case AArch64::LDRXui: case AArch64::LDRWui: case AArch64::LDRSWui: // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: case AArch64::STURQi: case AArch64::STURWi: case AArch64::STURXi: case AArch64::LDURSi: case AArch64::LDURDi: case AArch64::LDURQi: case AArch64::LDURWi: case AArch64::LDURXi: case AArch64::LDURSWi: { if (tryToPairLdStInst(MBBI)) { Modified = true; break; } ++MBBI; break; } } } // 4) Find base register updates that can be merged into the load or store // as a base-reg writeback. // e.g., // ldr x0, [x2] // add x2, x2, #4 // ; becomes // ldr x0, [x2], #4 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { MachineInstr &MI = *MBBI; // Do update merging. It's simpler to keep this separate from the above // switchs, though not strictly necessary. unsigned Opc = MI.getOpcode(); switch (Opc) { default: // Just move on to the next instruction. ++MBBI; break; // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: case AArch64::STRXui: case AArch64::STRWui: case AArch64::STRHHui: case AArch64::STRBBui: case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: case AArch64::LDRXui: case AArch64::LDRWui: case AArch64::LDRHHui: case AArch64::LDRBBui: // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: case AArch64::STURQi: case AArch64::STURWi: case AArch64::STURXi: case AArch64::LDURSi: case AArch64::LDURDi: case AArch64::LDURQi: case AArch64::LDURWi: case AArch64::LDURXi: // Paired instructions. case AArch64::LDPSi: case AArch64::LDPSWi: case AArch64::LDPDi: case AArch64::LDPQi: case AArch64::LDPWi: case AArch64::LDPXi: case AArch64::STPSi: case AArch64::STPDi: case AArch64::STPQi: case AArch64::STPWi: case AArch64::STPXi: { // Make sure this is a reg+imm (as opposed to an address reloc). if (!getLdStOffsetOp(MI).isImm()) { ++MBBI; break; } // Look forward to try to form a post-index instruction. For example, // ldr x0, [x20] // add x20, x20, #32 // merged into: // ldr x0, [x20], #32 MachineBasicBlock::iterator Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit); if (Update != E) { // Merge the update into the ld/st. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); Modified = true; ++NumPostFolded; break; } // Don't know how to handle pre/post-index versions, so move to the next // instruction. if (TII->isUnscaledLdSt(Opc)) { ++MBBI; break; } // Look back to try to find a pre-index instruction. For example, // add x0, x0, #8 // ldr x1, [x0] // merged into: // ldr x1, [x0, #8]! Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit); if (Update != E) { // Merge the update into the ld/st. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); Modified = true; ++NumPreFolded; break; } // The immediate in the load/store is scaled by the size of the memory // operation. The immediate in the add we're looking for, // however, is not, so adjust here. int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI); // Look forward to try to find a post-index instruction. For example, // ldr x1, [x0, #64] // add x0, x0, #64 // merged into: // ldr x1, [x0, #64]! Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit); if (Update != E) { // Merge the update into the ld/st. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); Modified = true; ++NumPreFolded; break; } // Nothing found. Just move to the next instruction. ++MBBI; break; } } } return Modified; } bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { if (skipFunction(*Fn.getFunction())) return false; Subtarget = &static_cast(Fn.getSubtarget()); TII = static_cast(Subtarget->getInstrInfo()); TRI = Subtarget->getRegisterInfo(); // Resize the modified and used register bitfield trackers. We do this once // per function and then clear the bitfield each time we optimize a load or // store. ModifiedRegs.resize(TRI->getNumRegs()); UsedRegs.resize(TRI->getNumRegs()); bool Modified = false; bool enableNarrowLdOpt = Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign(); for (auto &MBB : Fn) Modified |= optimizeBlock(MBB, enableNarrowLdOpt); return Modified; } // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep // loads and stores near one another? // FIXME: When pairing store instructions it's very possible for this pass to // hoist a store with a KILL marker above another use (without a KILL marker). // The resulting IR is invalid, but nothing uses the KILL markers after this // pass, so it's never caused a problem in practice. /// createAArch64LoadStoreOptimizationPass - returns an instance of the /// load / store optimization pass. FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() { return new AArch64LoadStoreOpt(); } Index: projects/clang390-import/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- projects/clang390-import/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp (revision 304769) +++ projects/clang390-import/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp (revision 304770) @@ -1,12090 +1,12097 @@ //===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements the PPCISelLowering class. // //===----------------------------------------------------------------------===// #include "PPCISelLowering.h" #include "MCTargetDesc/PPCPredicates.h" #include "PPCCallingConv.h" #include "PPCCCState.h" #include "PPCMachineFunctionInfo.h" #include "PPCPerfectShuffle.h" #include "PPCTargetMachine.h" #include "PPCTargetObjectFile.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" #include using namespace llvm; #define DEBUG_TYPE "ppc-lowering" static cl::opt DisablePPCPreinc("disable-ppc-preinc", cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden); static cl::opt DisableILPPref("disable-ppc-ilp-pref", cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden); static cl::opt DisablePPCUnaligned("disable-ppc-unaligned", cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden); static cl::opt DisableSCO("disable-ppc-sco", cl::desc("disable sibling call optimization on ppc"), cl::Hidden); STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumSiblingCalls, "Number of sibling calls"); // FIXME: Remove this once the bug has been fixed! extern cl::opt ANDIGlueBug; PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, const PPCSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { // Use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(true); // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all // arguments are at least 4/8 bytes aligned. bool isPPC64 = Subtarget.isPPC64(); setMinStackArgumentAlignment(isPPC64 ? 8:4); // Set up the register classes. addRegisterClass(MVT::i32, &PPC::GPRCRegClass); if (!useSoftFloat()) { addRegisterClass(MVT::f32, &PPC::F4RCRegClass); addRegisterClass(MVT::f64, &PPC::F8RCRegClass); } // PowerPC has an i16 but no i8 (or i1) SEXTLOAD for (MVT VT : MVT::integer_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand); } setTruncStoreAction(MVT::f64, MVT::f32, Expand); // PowerPC has pre-inc load and store's. setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal); if (Subtarget.useCRBits()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); if (isPPC64 || Subtarget.hasFPCVT()) { setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); AddPromotedToType (ISD::SINT_TO_FP, MVT::i1, isPPC64 ? MVT::i64 : MVT::i32); setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); AddPromotedToType(ISD::UINT_TO_FP, MVT::i1, isPPC64 ? MVT::i64 : MVT::i32); } else { setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom); } // PowerPC does not support direct load / store of condition registers setOperationAction(ISD::LOAD, MVT::i1, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); // FIXME: Remove this once the ANDI glue bug is fixed: if (ANDIGlueBug) setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); for (MVT VT : MVT::integer_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); setTruncStoreAction(VT, MVT::i1, Expand); } addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass); } // This is used in the ppcf128->int sequence. Note it has different semantics // from FP_ROUND: that rounds to nearest, this rounds to zero. setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom); // We do not currently implement these libm ops for PowerPC. setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand); setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand); setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand); setOperationAction(ISD::FRINT, MVT::ppcf128, Expand); setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand); setOperationAction(ISD::FREM, MVT::ppcf128, Expand); // PowerPC has no SREM/UREM instructions setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::SREM, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM. setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::UDIVREM, MVT::i64, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); // We don't support sin/cos/sqrt/fmod/pow setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FREM , MVT::f64, Expand); setOperationAction(ISD::FPOW , MVT::f64, Expand); setOperationAction(ISD::FMA , MVT::f64, Legal); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FREM , MVT::f32, Expand); setOperationAction(ISD::FPOW , MVT::f32, Expand); setOperationAction(ISD::FMA , MVT::f32, Legal); setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); // If we're enabling GP optimizations, use hardware square root if (!Subtarget.hasFSQRT() && !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() && Subtarget.hasFRE())) setOperationAction(ISD::FSQRT, MVT::f64, Expand); if (!Subtarget.hasFSQRT() && !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() && Subtarget.hasFRES())) setOperationAction(ISD::FSQRT, MVT::f32, Expand); if (Subtarget.hasFCPSGN()) { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal); } else { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); } if (Subtarget.hasFPRND()) { setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FROUND, MVT::f64, Legal); setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); setOperationAction(ISD::FROUND, MVT::f32, Legal); } // PowerPC does not have BSWAP, CTPOP or CTTZ setOperationAction(ISD::BSWAP, MVT::i32 , Expand); setOperationAction(ISD::CTTZ , MVT::i32 , Expand); setOperationAction(ISD::BSWAP, MVT::i64 , Expand); setOperationAction(ISD::CTTZ , MVT::i64 , Expand); if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) { setOperationAction(ISD::CTPOP, MVT::i32 , Legal); setOperationAction(ISD::CTPOP, MVT::i64 , Legal); } else { setOperationAction(ISD::CTPOP, MVT::i32 , Expand); setOperationAction(ISD::CTPOP, MVT::i64 , Expand); } // PowerPC does not have ROTR setOperationAction(ISD::ROTR, MVT::i32 , Expand); setOperationAction(ISD::ROTR, MVT::i64 , Expand); if (!Subtarget.useCRBits()) { // PowerPC does not have Select setOperationAction(ISD::SELECT, MVT::i32, Expand); setOperationAction(ISD::SELECT, MVT::i64, Expand); setOperationAction(ISD::SELECT, MVT::f32, Expand); setOperationAction(ISD::SELECT, MVT::f64, Expand); } // PowerPC wants to turn select_cc of FP into fsel when possible. setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); // PowerPC wants to optimize integer setcc a bit if (!Subtarget.useCRBits()) setOperationAction(ISD::SETCC, MVT::i32, Custom); // PowerPC does not have BRCOND which requires SetCC if (!Subtarget.useCRBits()) setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_JT, MVT::Other, Expand); // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores. setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); // PowerPC does not have [U|S]INT_TO_FP setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); if (Subtarget.hasDirectMove() && isPPC64) { setOperationAction(ISD::BITCAST, MVT::f32, Legal); setOperationAction(ISD::BITCAST, MVT::i32, Legal); setOperationAction(ISD::BITCAST, MVT::i64, Legal); setOperationAction(ISD::BITCAST, MVT::f64, Legal); } else { setOperationAction(ISD::BITCAST, MVT::f32, Expand); setOperationAction(ISD::BITCAST, MVT::i32, Expand); setOperationAction(ISD::BITCAST, MVT::i64, Expand); setOperationAction(ISD::BITCAST, MVT::f64, Expand); } // We cannot sextinreg(i1). Expand to shifts. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support // SjLj exception handling but a light-weight setjmp/longjmp replacement to // support continuation, user-level threading, and etc.. As a result, no // other SjLj exception interfaces are implemented and please don't build // your own exception handling based on them. // LLVM/Clang supports zero-cost DWARF exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); // We want to legalize GlobalAddress and ConstantPool nodes into the // appropriate instructions to materialize the address. setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::BlockAddress, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); setOperationAction(ISD::JumpTable, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); setOperationAction(ISD::BlockAddress, MVT::i64, Custom); setOperationAction(ISD::ConstantPool, MVT::i64, Custom); setOperationAction(ISD::JumpTable, MVT::i64, Custom); // TRAP is legal. setOperationAction(ISD::TRAP, MVT::Other, Legal); // TRAMPOLINE is custom lowered. setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); if (Subtarget.isSVR4ABI()) { if (isPPC64) { // VAARG always uses double-word chunks, so promote anything smaller. setOperationAction(ISD::VAARG, MVT::i1, Promote); AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64); setOperationAction(ISD::VAARG, MVT::i8, Promote); AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64); setOperationAction(ISD::VAARG, MVT::i16, Promote); AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64); setOperationAction(ISD::VAARG, MVT::i32, Promote); AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64); setOperationAction(ISD::VAARG, MVT::Other, Expand); } else { // VAARG is custom lowered with the 32-bit SVR4 ABI. setOperationAction(ISD::VAARG, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::i64, Custom); } } else setOperationAction(ISD::VAARG, MVT::Other, Expand); if (Subtarget.isSVR4ABI() && !isPPC64) // VACOPY is custom lowered with the 32-bit SVR4 ABI. setOperationAction(ISD::VACOPY , MVT::Other, Custom); else setOperationAction(ISD::VACOPY , MVT::Other, Expand); // Use the default implementation. setOperationAction(ISD::VAEND , MVT::Other, Expand); setOperationAction(ISD::STACKSAVE , MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom); setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom); setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom); // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); // To handle counter-based loop conditions. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom); // Comparisons that require checking two conditions. setCondCodeAction(ISD::SETULT, MVT::f32, Expand); setCondCodeAction(ISD::SETULT, MVT::f64, Expand); setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); setCondCodeAction(ISD::SETUGT, MVT::f64, Expand); setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand); setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); setCondCodeAction(ISD::SETOGE, MVT::f64, Expand); setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); setCondCodeAction(ISD::SETOLE, MVT::f64, Expand); setCondCodeAction(ISD::SETONE, MVT::f32, Expand); setCondCodeAction(ISD::SETONE, MVT::f64, Expand); if (Subtarget.has64BitSupport()) { // They also have instructions for converting between i64 and fp. setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); // This is just the low 32 bits of a (signed) fp->i64 conversion. // We cannot do this with Promote because i64 is not a legal type. setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); if (Subtarget.hasLFIWAX() || Subtarget.isPPC64()) setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); } else { // PowerPC does not have FP_TO_UINT on 32-bit implementations. setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); } // With the instructions enabled under FPCVT, we can do everything. if (Subtarget.hasFPCVT()) { if (Subtarget.has64BitSupport()) { setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); } setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); } if (Subtarget.use64BitRegs()) { // 64-bit PowerPC implementations can support i64 types directly addRegisterClass(MVT::i64, &PPC::G8RCRegClass); // BUILD_PAIR can't be handled natively, and should be expanded to shl/or setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); // 64-bit PowerPC wants to expand i128 shifts itself. setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); } else { // 32-bit PowerPC wants to expand i64 shifts itself. setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); } if (Subtarget.hasAltivec()) { // First set operation action for all vector types to expand. Then we // will selectively turn on ones that can be effectively codegen'd. for (MVT VT : MVT::vector_valuetypes()) { // add/sub are legal for all supported vector VT's. setOperationAction(ISD::ADD, VT, Legal); setOperationAction(ISD::SUB, VT, Legal); // Vector instructions introduced in P8 if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) { setOperationAction(ISD::CTPOP, VT, Legal); setOperationAction(ISD::CTLZ, VT, Legal); } else { setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); } // We promote all shuffles to v16i8. setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote); AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8); // We promote all non-typed operations to v4i32. setOperationAction(ISD::AND , VT, Promote); AddPromotedToType (ISD::AND , VT, MVT::v4i32); setOperationAction(ISD::OR , VT, Promote); AddPromotedToType (ISD::OR , VT, MVT::v4i32); setOperationAction(ISD::XOR , VT, Promote); AddPromotedToType (ISD::XOR , VT, MVT::v4i32); setOperationAction(ISD::LOAD , VT, Promote); AddPromotedToType (ISD::LOAD , VT, MVT::v4i32); setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v4i32); setOperationAction(ISD::SELECT_CC, VT, Promote); AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32); setOperationAction(ISD::STORE, VT, Promote); AddPromotedToType (ISD::STORE, VT, MVT::v4i32); // No other operations are legal. setOperationAction(ISD::MUL , VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FNEG, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FPOWI, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::BUILD_VECTOR, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); } } // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle // with merges, splats, etc. setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom); setOperationAction(ISD::AND , MVT::v4i32, Legal); setOperationAction(ISD::OR , MVT::v4i32, Legal); setOperationAction(ISD::XOR , MVT::v4i32, Legal); setOperationAction(ISD::LOAD , MVT::v4i32, Legal); setOperationAction(ISD::SELECT, MVT::v4i32, Subtarget.useCRBits() ? Legal : Expand); setOperationAction(ISD::STORE , MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass); addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass); addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass); addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass); setOperationAction(ISD::MUL, MVT::v4f32, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) { setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); } if (Subtarget.hasP8Altivec()) setOperationAction(ISD::MUL, MVT::v4i32, Legal); else setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); // Altivec does not contain unordered floating-point compare instructions setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand); setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand); setCondCodeAction(ISD::SETO, MVT::v4f32, Expand); setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand); if (Subtarget.hasVSX()) { setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); if (Subtarget.hasP8Vector()) { setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal); } if (Subtarget.hasDirectMove() && isPPC64) { setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); } setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); setOperationAction(ISD::FROUND, MVT::v2f64, Legal); setOperationAction(ISD::FROUND, MVT::v4f32, Legal); setOperationAction(ISD::MUL, MVT::v2f64, Legal); setOperationAction(ISD::FMA, MVT::v2f64, Legal); setOperationAction(ISD::FDIV, MVT::v2f64, Legal); setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); setOperationAction(ISD::VSELECT, MVT::v8i16, Legal); setOperationAction(ISD::VSELECT, MVT::v4i32, Legal); setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); // Share the Altivec comparison restrictions. setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand); setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand); setCondCodeAction(ISD::SETO, MVT::v2f64, Expand); setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand); setOperationAction(ISD::LOAD, MVT::v2f64, Legal); setOperationAction(ISD::STORE, MVT::v2f64, Legal); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal); if (Subtarget.hasP8Vector()) addRegisterClass(MVT::f32, &PPC::VSSRCRegClass); addRegisterClass(MVT::f64, &PPC::VSFRCRegClass); addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass); addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass); addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass); if (Subtarget.hasP8Altivec()) { setOperationAction(ISD::SHL, MVT::v2i64, Legal); setOperationAction(ISD::SRA, MVT::v2i64, Legal); setOperationAction(ISD::SRL, MVT::v2i64, Legal); setOperationAction(ISD::SETCC, MVT::v2i64, Legal); } else { setOperationAction(ISD::SHL, MVT::v2i64, Expand); setOperationAction(ISD::SRA, MVT::v2i64, Expand); setOperationAction(ISD::SRL, MVT::v2i64, Expand); setOperationAction(ISD::SETCC, MVT::v2i64, Custom); // VSX v2i64 only supports non-arithmetic operations. setOperationAction(ISD::ADD, MVT::v2i64, Expand); setOperationAction(ISD::SUB, MVT::v2i64, Expand); } setOperationAction(ISD::LOAD, MVT::v2i64, Promote); AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64); setOperationAction(ISD::STORE, MVT::v2i64, Promote); AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); // Vector operation legalization checks the result type of // SIGN_EXTEND_INREG, overall legalization checks the inner type. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); setOperationAction(ISD::FNEG, MVT::v4f32, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Legal); setOperationAction(ISD::FABS, MVT::v4f32, Legal); setOperationAction(ISD::FABS, MVT::v2f64, Legal); addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass); } if (Subtarget.hasP8Altivec()) { addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass); addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass); } if (Subtarget.hasP9Vector()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Legal); } } if (Subtarget.hasQPX()) { setOperationAction(ISD::FADD, MVT::v4f64, Legal); setOperationAction(ISD::FSUB, MVT::v4f64, Legal); setOperationAction(ISD::FMUL, MVT::v4f64, Legal); setOperationAction(ISD::FREM, MVT::v4f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal); setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand); setOperationAction(ISD::LOAD , MVT::v4f64, Custom); setOperationAction(ISD::STORE , MVT::v4f64, Custom); setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom); if (!Subtarget.useCRBits()) setOperationAction(ISD::SELECT, MVT::v4f64, Expand); setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand); setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand); setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal); setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand); setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal); setOperationAction(ISD::FP_ROUND_INREG , MVT::v4f32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); setOperationAction(ISD::FNEG , MVT::v4f64, Legal); setOperationAction(ISD::FABS , MVT::v4f64, Legal); setOperationAction(ISD::FSIN , MVT::v4f64, Expand); setOperationAction(ISD::FCOS , MVT::v4f64, Expand); setOperationAction(ISD::FPOWI , MVT::v4f64, Expand); setOperationAction(ISD::FPOW , MVT::v4f64, Expand); setOperationAction(ISD::FLOG , MVT::v4f64, Expand); setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand); setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand); setOperationAction(ISD::FEXP , MVT::v4f64, Expand); setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand); setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal); addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); setOperationAction(ISD::FSUB, MVT::v4f32, Legal); setOperationAction(ISD::FMUL, MVT::v4f32, Legal); setOperationAction(ISD::FREM, MVT::v4f32, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal); setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand); setOperationAction(ISD::LOAD , MVT::v4f32, Custom); setOperationAction(ISD::STORE , MVT::v4f32, Custom); if (!Subtarget.useCRBits()) setOperationAction(ISD::SELECT, MVT::v4f32, Expand); setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand); setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal); setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand); setOperationAction(ISD::FNEG , MVT::v4f32, Legal); setOperationAction(ISD::FABS , MVT::v4f32, Legal); setOperationAction(ISD::FSIN , MVT::v4f32, Expand); setOperationAction(ISD::FCOS , MVT::v4f32, Expand); setOperationAction(ISD::FPOWI , MVT::v4f32, Expand); setOperationAction(ISD::FPOW , MVT::v4f32, Expand); setOperationAction(ISD::FLOG , MVT::v4f32, Expand); setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand); setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand); setOperationAction(ISD::FEXP , MVT::v4f32, Expand); setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand); setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal); addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass); setOperationAction(ISD::AND , MVT::v4i1, Legal); setOperationAction(ISD::OR , MVT::v4i1, Legal); setOperationAction(ISD::XOR , MVT::v4i1, Legal); if (!Subtarget.useCRBits()) setOperationAction(ISD::SELECT, MVT::v4i1, Expand); setOperationAction(ISD::VSELECT, MVT::v4i1, Legal); setOperationAction(ISD::LOAD , MVT::v4i1, Custom); setOperationAction(ISD::STORE , MVT::v4i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand); setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand); setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass); setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); setOperationAction(ISD::FROUND, MVT::v4f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); setOperationAction(ISD::FROUND, MVT::v4f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); // These need to set FE_INEXACT, and so cannot be vectorized here. setOperationAction(ISD::FRINT, MVT::v4f64, Expand); setOperationAction(ISD::FRINT, MVT::v4f32, Expand); if (TM.Options.UnsafeFPMath) { setOperationAction(ISD::FDIV, MVT::v4f64, Legal); setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); } else { setOperationAction(ISD::FDIV, MVT::v4f64, Expand); setOperationAction(ISD::FSQRT, MVT::v4f64, Expand); setOperationAction(ISD::FDIV, MVT::v4f32, Expand); setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); } } if (Subtarget.has64BitSupport()) setOperationAction(ISD::PREFETCH, MVT::Other, Legal); setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom); if (!isPPC64) { setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand); setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); } setBooleanContents(ZeroOrOneBooleanContent); if (Subtarget.hasAltivec()) { // Altivec instructions set fields to all zeros or all ones. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); } if (!isPPC64) { // These libcalls are not available in 32-bit. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); } setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1); // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::BUILD_VECTOR); if (Subtarget.hasFPCVT()) setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::BR_CC); if (Subtarget.useCRBits()) setTargetDAGCombine(ISD::BRCOND); setTargetDAGCombine(ISD::BSWAP); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); if (Subtarget.useCRBits()) { setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::SELECT_CC); } // Use reciprocal estimates. if (TM.Options.UnsafeFPMath) { setTargetDAGCombine(ISD::FDIV); setTargetDAGCombine(ISD::FSQRT); } // Darwin long double math library functions have $LDBL128 appended. if (Subtarget.isDarwin()) { setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128"); setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128"); setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128"); setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128"); setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128"); setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128"); setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128"); setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128"); setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128"); setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128"); } // With 32 condition bits, we don't need to sink (and duplicate) compares // aggressively in CodeGenPrep. if (Subtarget.useCRBits()) { setHasMultipleConditionRegisters(); setJumpIsExpensive(); } setMinFunctionAlignment(2); if (Subtarget.isDarwin()) setPrefFunctionAlignment(4); switch (Subtarget.getDarwinDirective()) { default: break; case PPC::DIR_970: case PPC::DIR_A2: case PPC::DIR_E500mc: case PPC::DIR_E5500: case PPC::DIR_PWR4: case PPC::DIR_PWR5: case PPC::DIR_PWR5X: case PPC::DIR_PWR6: case PPC::DIR_PWR6X: case PPC::DIR_PWR7: case PPC::DIR_PWR8: case PPC::DIR_PWR9: setPrefFunctionAlignment(4); setPrefLoopAlignment(4); break; } if (Subtarget.enableMachineScheduler()) setSchedulingPreference(Sched::Source); else setSchedulingPreference(Sched::Hybrid); computeRegisterProperties(STI.getRegisterInfo()); // The Freescale cores do better with aggressive inlining of memcpy and // friends. GCC uses same threshold of 128 bytes (= 32 word stores). if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc || Subtarget.getDarwinDirective() == PPC::DIR_E5500) { MaxStoresPerMemset = 32; MaxStoresPerMemsetOptSize = 16; MaxStoresPerMemcpy = 32; MaxStoresPerMemcpyOptSize = 8; MaxStoresPerMemmove = 32; MaxStoresPerMemmoveOptSize = 8; } else if (Subtarget.getDarwinDirective() == PPC::DIR_A2) { // The A2 also benefits from (very) aggressive inlining of memcpy and // friends. The overhead of a the function call, even when warm, can be // over one hundred cycles. MaxStoresPerMemset = 128; MaxStoresPerMemcpy = 128; MaxStoresPerMemmove = 128; } } /// getMaxByValAlign - Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign, unsigned MaxMaxAlign) { if (MaxAlign == MaxMaxAlign) return; if (VectorType *VTy = dyn_cast(Ty)) { if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256) MaxAlign = 32; else if (VTy->getBitWidth() >= 128 && MaxAlign < 16) MaxAlign = 16; } else if (ArrayType *ATy = dyn_cast(Ty)) { unsigned EltAlign = 0; getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast(Ty)) { for (auto *EltTy : STy->elements()) { unsigned EltAlign = 0; getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == MaxMaxAlign) break; } } } /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, const DataLayout &DL) const { // Darwin passes everything on 4 byte boundary. if (Subtarget.isDarwin()) return 4; // 16byte and wider vectors are passed on 16byte boundary. // The rest is 8 on PPC64 and 4 on PPC32 boundary. unsigned Align = Subtarget.isPPC64() ? 8 : 4; if (Subtarget.hasAltivec() || Subtarget.hasQPX()) getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16); return Align; } bool PPCTargetLowering::useSoftFloat() const { return Subtarget.useSoftFloat(); } const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((PPCISD::NodeType)Opcode) { case PPCISD::FIRST_NUMBER: break; case PPCISD::FSEL: return "PPCISD::FSEL"; case PPCISD::FCFID: return "PPCISD::FCFID"; case PPCISD::FCFIDU: return "PPCISD::FCFIDU"; case PPCISD::FCFIDS: return "PPCISD::FCFIDS"; case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS"; case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ"; case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ"; case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ"; case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ"; case PPCISD::FRE: return "PPCISD::FRE"; case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE"; case PPCISD::STFIWX: return "PPCISD::STFIWX"; case PPCISD::VMADDFP: return "PPCISD::VMADDFP"; case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP"; case PPCISD::VPERM: return "PPCISD::VPERM"; case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; case PPCISD::XXINSERT: return "PPCISD::XXINSERT"; case PPCISD::VECSHL: return "PPCISD::VECSHL"; case PPCISD::CMPB: return "PPCISD::CMPB"; case PPCISD::Hi: return "PPCISD::Hi"; case PPCISD::Lo: return "PPCISD::Lo"; case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; case PPCISD::SRL: return "PPCISD::SRL"; case PPCISD::SRA: return "PPCISD::SRA"; case PPCISD::SHL: return "PPCISD::SHL"; case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE"; case PPCISD::CALL: return "PPCISD::CALL"; case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP"; case PPCISD::MTCTR: return "PPCISD::MTCTR"; case PPCISD::BCTRL: return "PPCISD::BCTRL"; case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC"; case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG"; case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE"; case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP"; case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP"; case PPCISD::MFOCRF: return "PPCISD::MFOCRF"; case PPCISD::MFVSR: return "PPCISD::MFVSR"; case PPCISD::MTVSRA: return "PPCISD::MTVSRA"; case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ"; case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP"; case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP"; case PPCISD::ANDIo_1_EQ_BIT: return "PPCISD::ANDIo_1_EQ_BIT"; case PPCISD::ANDIo_1_GT_BIT: return "PPCISD::ANDIo_1_GT_BIT"; case PPCISD::VCMP: return "PPCISD::VCMP"; case PPCISD::VCMPo: return "PPCISD::VCMPo"; case PPCISD::LBRX: return "PPCISD::LBRX"; case PPCISD::STBRX: return "PPCISD::STBRX"; case PPCISD::LFIWAX: return "PPCISD::LFIWAX"; case PPCISD::LFIWZX: return "PPCISD::LFIWZX"; case PPCISD::LXVD2X: return "PPCISD::LXVD2X"; case PPCISD::STXVD2X: return "PPCISD::STXVD2X"; case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; case PPCISD::BDNZ: return "PPCISD::BDNZ"; case PPCISD::BDZ: return "PPCISD::BDZ"; case PPCISD::MFFS: return "PPCISD::MFFS"; case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ"; case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN"; case PPCISD::CR6SET: return "PPCISD::CR6SET"; case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET"; case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT"; case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT"; case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA"; case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L"; case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS"; case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA"; case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L"; case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR"; case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR"; case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA"; case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L"; case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR"; case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR"; case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA"; case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT"; case PPCISD::SC: return "PPCISD::SC"; case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB"; case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE"; case PPCISD::RFEBB: return "PPCISD::RFEBB"; case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; case PPCISD::QVFPERM: return "PPCISD::QVFPERM"; case PPCISD::QVGPCI: return "PPCISD::QVGPCI"; case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI"; case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI"; case PPCISD::QBFLT: return "PPCISD::QBFLT"; case PPCISD::QVLFSb: return "PPCISD::QVLFSb"; } return nullptr; } EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, EVT VT) const { if (!VT.isVector()) return Subtarget.useCRBits() ? MVT::i1 : MVT::i32; if (Subtarget.hasQPX()) return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements()); return VT.changeVectorElementTypeToInteger(); } bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const { assert(VT.isFloatingPoint() && "Non-floating-point FMA?"); return true; } //===----------------------------------------------------------------------===// // Node matching predicates, for use by the tblgen matching code. //===----------------------------------------------------------------------===// /// isFloatingPointZero - Return true if this is 0.0 or -0.0. static bool isFloatingPointZero(SDValue Op) { if (ConstantFPSDNode *CFP = dyn_cast(Op)) return CFP->getValueAPF().isZero(); else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { // Maybe this has already been legalized into the constant pool? if (ConstantPoolSDNode *CP = dyn_cast(Op.getOperand(1))) if (const ConstantFP *CFP = dyn_cast(CP->getConstVal())) return CFP->getValueAPF().isZero(); } return false; } /// isConstantOrUndef - Op is either an undef node or a ConstantSDNode. Return /// true if Op is undef or if it matches the specified value. static bool isConstantOrUndef(int Op, int Val) { return Op < 0 || Op == Val; } /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUHUM instruction. /// The ShuffleKind distinguishes between big-endian operations with /// two different inputs (0), either-endian operations with two identical /// inputs (1), and little-endian operations with two different inputs (2). /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { bool IsLE = DAG.getDataLayout().isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; for (unsigned i = 0; i != 16; ++i) if (!isConstantOrUndef(N->getMaskElt(i), i*2+1)) return false; } else if (ShuffleKind == 2) { if (!IsLE) return false; for (unsigned i = 0; i != 16; ++i) if (!isConstantOrUndef(N->getMaskElt(i), i*2)) return false; } else if (ShuffleKind == 1) { unsigned j = IsLE ? 0 : 1; for (unsigned i = 0; i != 8; ++i) if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+8), i*2+j)) return false; } return true; } /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUWUM instruction. /// The ShuffleKind distinguishes between big-endian operations with /// two different inputs (0), either-endian operations with two identical /// inputs (1), and little-endian operations with two different inputs (2). /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { bool IsLE = DAG.getDataLayout().isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; for (unsigned i = 0; i != 16; i += 2) if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+3)) return false; } else if (ShuffleKind == 2) { if (!IsLE) return false; for (unsigned i = 0; i != 16; i += 2) if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+1)) return false; } else if (ShuffleKind == 1) { unsigned j = IsLE ? 0 : 2; for (unsigned i = 0; i != 8; i += 2) if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1)) return false; } return true; } /// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUDUM instruction, AND the VPKUDUM instruction exists for the /// current subtarget. /// /// The ShuffleKind distinguishes between big-endian operations with /// two different inputs (0), either-endian operations with two identical /// inputs (1), and little-endian operations with two different inputs (2). /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { const PPCSubtarget& Subtarget = static_cast(DAG.getSubtarget()); if (!Subtarget.hasP8Vector()) return false; bool IsLE = DAG.getDataLayout().isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; for (unsigned i = 0; i != 16; i += 4) if (!isConstantOrUndef(N->getMaskElt(i ), i*2+4) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+5) || !isConstantOrUndef(N->getMaskElt(i+2), i*2+6) || !isConstantOrUndef(N->getMaskElt(i+3), i*2+7)) return false; } else if (ShuffleKind == 2) { if (!IsLE) return false; for (unsigned i = 0; i != 16; i += 4) if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+1) || !isConstantOrUndef(N->getMaskElt(i+2), i*2+2) || !isConstantOrUndef(N->getMaskElt(i+3), i*2+3)) return false; } else if (ShuffleKind == 1) { unsigned j = IsLE ? 0 : 4; for (unsigned i = 0; i != 8; i += 4) if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || !isConstantOrUndef(N->getMaskElt(i+2), i*2+j+2) || !isConstantOrUndef(N->getMaskElt(i+3), i*2+j+3) || !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1) || !isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) || !isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3)) return false; } return true; } /// isVMerge - Common function, used to match vmrg* shuffles. /// static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, unsigned LHSStart, unsigned RHSStart) { if (N->getValueType(0) != MVT::v16i8) return false; assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) && "Unsupported merge size!"); for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j), LHSStart+j+i*UnitSize) || !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j), RHSStart+j+i*UnitSize)) return false; } return true; } /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes). /// The ShuffleKind distinguishes between big-endian merges with two /// different inputs (0), either-endian merges with two identical inputs (1), /// and little-endian merges with two different inputs (2). For the latter, /// the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, unsigned ShuffleKind, SelectionDAG &DAG) { if (DAG.getDataLayout().isLittleEndian()) { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 0, 0); else if (ShuffleKind == 2) // swapped return isVMerge(N, UnitSize, 0, 16); else return false; } else { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 8, 8); else if (ShuffleKind == 0) // normal return isVMerge(N, UnitSize, 8, 24); else return false; } } /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for /// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes). /// The ShuffleKind distinguishes between big-endian merges with two /// different inputs (0), either-endian merges with two identical inputs (1), /// and little-endian merges with two different inputs (2). For the latter, /// the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, unsigned ShuffleKind, SelectionDAG &DAG) { if (DAG.getDataLayout().isLittleEndian()) { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 8, 8); else if (ShuffleKind == 2) // swapped return isVMerge(N, UnitSize, 8, 24); else return false; } else { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 0, 0); else if (ShuffleKind == 0) // normal return isVMerge(N, UnitSize, 0, 16); else return false; } } /** * \brief Common function used to match vmrgew and vmrgow shuffles * * The indexOffset determines whether to look for even or odd words in * the shuffle mask. This is based on the of the endianness of the target * machine. * - Little Endian: * - Use offset of 0 to check for odd elements * - Use offset of 4 to check for even elements * - Big Endian: * - Use offset of 0 to check for even elements * - Use offset of 4 to check for odd elements * A detailed description of the vector element ordering for little endian and * big endian can be found at * http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html * Targeting your applications - what little endian and big endian IBM XL C/C++ * compiler differences mean to you * * The mask to the shuffle vector instruction specifies the indices of the * elements from the two input vectors to place in the result. The elements are * numbered in array-access order, starting with the first vector. These vectors * are always of type v16i8, thus each vector will contain 16 elements of size * 8. More info on the shuffle vector can be found in the * http://llvm.org/docs/LangRef.html#shufflevector-instruction * Language Reference. * * The RHSStartValue indicates whether the same input vectors are used (unary) * or two different input vectors are used, based on the following: * - If the instruction uses the same vector for both inputs, the range of the * indices will be 0 to 15. In this case, the RHSStart value passed should * be 0. * - If the instruction has two different vectors then the range of the * indices will be 0 to 31. In this case, the RHSStart value passed should * be 16 (indices 0-15 specify elements in the first vector while indices 16 * to 31 specify elements in the second vector). * * \param[in] N The shuffle vector SD Node to analyze * \param[in] IndexOffset Specifies whether to look for even or odd elements * \param[in] RHSStartValue Specifies the starting index for the righthand input * vector to the shuffle_vector instruction * \return true iff this shuffle vector represents an even or odd word merge */ static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset, unsigned RHSStartValue) { if (N->getValueType(0) != MVT::v16i8) return false; for (unsigned i = 0; i < 2; ++i) for (unsigned j = 0; j < 4; ++j) if (!isConstantOrUndef(N->getMaskElt(i*4+j), i*RHSStartValue+j+IndexOffset) || !isConstantOrUndef(N->getMaskElt(i*4+j+8), i*RHSStartValue+j+IndexOffset+8)) return false; return true; } /** * \brief Determine if the specified shuffle mask is suitable for the vmrgew or * vmrgow instructions. * * \param[in] N The shuffle vector SD Node to analyze * \param[in] CheckEven Check for an even merge (true) or an odd merge (false) * \param[in] ShuffleKind Identify the type of merge: * - 0 = big-endian merge with two different inputs; * - 1 = either-endian merge with two identical inputs; * - 2 = little-endian merge with two different inputs (inputs are swapped for * little-endian merges). * \param[in] DAG The current SelectionDAG * \return true iff this shuffle mask */ bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, unsigned ShuffleKind, SelectionDAG &DAG) { if (DAG.getDataLayout().isLittleEndian()) { unsigned indexOffset = CheckEven ? 4 : 0; if (ShuffleKind == 1) // Unary return isVMerge(N, indexOffset, 0); else if (ShuffleKind == 2) // swapped return isVMerge(N, indexOffset, 16); else return false; } else { unsigned indexOffset = CheckEven ? 0 : 4; if (ShuffleKind == 1) // Unary return isVMerge(N, indexOffset, 0); else if (ShuffleKind == 0) // Normal return isVMerge(N, indexOffset, 16); else return false; } return false; } /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift /// amount, otherwise return -1. /// The ShuffleKind distinguishes between big-endian operations with two /// different inputs (0), either-endian operations with two identical inputs /// (1), and little-endian operations with two different inputs (2). For the /// latter, the input operands are swapped (see PPCInstrAltivec.td). int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { if (N->getValueType(0) != MVT::v16i8) return -1; ShuffleVectorSDNode *SVOp = cast(N); // Find the first non-undef value in the shuffle mask. unsigned i; for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i) /*search*/; if (i == 16) return -1; // all undef. // Otherwise, check to see if the rest of the elements are consecutively // numbered from this value. unsigned ShiftAmt = SVOp->getMaskElt(i); if (ShiftAmt < i) return -1; ShiftAmt -= i; bool isLE = DAG.getDataLayout().isLittleEndian(); if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) { // Check the rest of the elements to see if they are consecutive. for (++i; i != 16; ++i) if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) return -1; } else if (ShuffleKind == 1) { // Check the rest of the elements to see if they are consecutive. for (++i; i != 16; ++i) if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) return -1; } else return -1; if (isLE) ShiftAmt = 16 - ShiftAmt; return ShiftAmt; } /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a splat of a single element that is suitable for input to /// VSPLTB/VSPLTH/VSPLTW. bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { assert(N->getValueType(0) == MVT::v16i8 && (EltSize == 1 || EltSize == 2 || EltSize == 4)); // The consecutive indices need to specify an element, not part of two // different elements. So abandon ship early if this isn't the case. if (N->getMaskElt(0) % EltSize != 0) return false; // This is a splat operation if each element of the permute is the same, and // if the value doesn't reference the second vector. unsigned ElementBase = N->getMaskElt(0); // FIXME: Handle UNDEF elements too! if (ElementBase >= 16) return false; // Check that the indices are consecutive, in the case of a multi-byte element // splatted with a v16i8 mask. for (unsigned i = 1; i != EltSize; ++i) if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase)) return false; for (unsigned i = EltSize, e = 16; i != e; i += EltSize) { if (N->getMaskElt(i) < 0) continue; for (unsigned j = 0; j != EltSize; ++j) if (N->getMaskElt(i+j) != N->getMaskElt(j)) return false; } return true; } bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, unsigned &InsertAtByte, bool &Swap, bool IsLE) { // Check that the mask is shuffling words for (unsigned i = 0; i < 4; ++i) { unsigned B0 = N->getMaskElt(i*4); unsigned B1 = N->getMaskElt(i*4+1); unsigned B2 = N->getMaskElt(i*4+2); unsigned B3 = N->getMaskElt(i*4+3); if (B0 % 4) return false; if (B1 != B0+1 || B2 != B1+1 || B3 != B2+1) return false; } // Now we look at mask elements 0,4,8,12 unsigned M0 = N->getMaskElt(0) / 4; unsigned M1 = N->getMaskElt(4) / 4; unsigned M2 = N->getMaskElt(8) / 4; unsigned M3 = N->getMaskElt(12) / 4; unsigned LittleEndianShifts[] = { 2, 1, 0, 3 }; unsigned BigEndianShifts[] = { 3, 0, 1, 2 }; // Below, let H and L be arbitrary elements of the shuffle mask // where H is in the range [4,7] and L is in the range [0,3]. // H, 1, 2, 3 or L, 5, 6, 7 if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) || (M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) { ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3]; InsertAtByte = IsLE ? 12 : 0; Swap = M0 < 4; return true; } // 0, H, 2, 3 or 4, L, 6, 7 if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) || (M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) { ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3]; InsertAtByte = IsLE ? 8 : 4; Swap = M1 < 4; return true; } // 0, 1, H, 3 or 4, 5, L, 7 if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) || (M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) { ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3]; InsertAtByte = IsLE ? 4 : 8; Swap = M2 < 4; return true; } // 0, 1, 2, H or 4, 5, 6, L if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) || (M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) { ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3]; InsertAtByte = IsLE ? 0 : 12; Swap = M3 < 4; return true; } // If both vector operands for the shuffle are the same vector, the mask will // contain only elements from the first one and the second one will be undef. if (N->getOperand(1).isUndef()) { ShiftElts = 0; Swap = true; unsigned XXINSERTWSrcElem = IsLE ? 2 : 1; if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) { InsertAtByte = IsLE ? 12 : 0; return true; } if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) { InsertAtByte = IsLE ? 8 : 4; return true; } if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) { InsertAtByte = IsLE ? 4 : 8; return true; } if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) { InsertAtByte = IsLE ? 0 : 12; return true; } } return false; } /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(N); assert(isSplatShuffleMask(SVOp, EltSize)); if (DAG.getDataLayout().isLittleEndian()) return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); else return SVOp->getMaskElt(0) / EltSize; } /// get_VSPLTI_elt - If this is a build_vector of constants which can be formed /// by using a vspltis[bhw] instruction of the specified element size, return /// the constant being splatted. The ByteSize field indicates the number of /// bytes of each element [124] -> [bhw]. SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { SDValue OpVal(nullptr, 0); // If ByteSize of the splat is bigger than the element size of the // build_vector, then we have a case where we are checking for a splat where // multiple elements of the buildvector are folded together into a single // logical element of the splat (e.g. "vsplish 1" to splat {0,1}*8). unsigned EltSize = 16/N->getNumOperands(); if (EltSize < ByteSize) { unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval. SDValue UniquedVals[4]; assert(Multiple > 1 && Multiple <= 4 && "How can this happen?"); // See if all of the elements in the buildvector agree across. for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { if (N->getOperand(i).isUndef()) continue; // If the element isn't a constant, bail fully out. if (!isa(N->getOperand(i))) return SDValue(); if (!UniquedVals[i&(Multiple-1)].getNode()) UniquedVals[i&(Multiple-1)] = N->getOperand(i); else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i)) return SDValue(); // no match. } // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains // either constant or undef values that are identical for each chunk. See // if these chunks can form into a larger vspltis*. // Check to see if all of the leading entries are either 0 or -1. If // neither, then this won't fit into the immediate field. bool LeadingZero = true; bool LeadingOnes = true; for (unsigned i = 0; i != Multiple-1; ++i) { if (!UniquedVals[i].getNode()) continue; // Must have been undefs. LeadingZero &= isNullConstant(UniquedVals[i]); LeadingOnes &= isAllOnesConstant(UniquedVals[i]); } // Finally, check the least significant entry. if (LeadingZero) { if (!UniquedVals[Multiple-1].getNode()) return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef int Val = cast(UniquedVals[Multiple-1])->getZExtValue(); if (Val < 16) // 0,0,0,4 -> vspltisw(4) return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32); } if (LeadingOnes) { if (!UniquedVals[Multiple-1].getNode()) return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef int Val =cast(UniquedVals[Multiple-1])->getSExtValue(); if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2) return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32); } return SDValue(); } // Check to see if this buildvec has a single non-undef value in its elements. for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { if (N->getOperand(i).isUndef()) continue; if (!OpVal.getNode()) OpVal = N->getOperand(i); else if (OpVal != N->getOperand(i)) return SDValue(); } if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def. unsigned ValSizeInBytes = EltSize; uint64_t Value = 0; if (ConstantSDNode *CN = dyn_cast(OpVal)) { Value = CN->getZExtValue(); } else if (ConstantFPSDNode *CN = dyn_cast(OpVal)) { assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!"); Value = FloatToBits(CN->getValueAPF().convertToFloat()); } // If the splat value is larger than the element value, then we can never do // this splat. The only case that we could fit the replicated bits into our // immediate field for would be zero, and we prefer to use vxor for it. if (ValSizeInBytes < ByteSize) return SDValue(); // If the element value is larger than the splat value, check if it consists // of a repeated bit pattern of size ByteSize. if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8)) return SDValue(); // Properly sign extend the value. int MaskVal = SignExtend32(Value, ByteSize * 8); // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros. if (MaskVal == 0) return SDValue(); // Finally, if this value fits in a 5 bit sext field, return it if (SignExtend32<5>(MaskVal) == MaskVal) return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32); return SDValue(); } /// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift /// amount, otherwise return -1. int PPC::isQVALIGNIShuffleMask(SDNode *N) { EVT VT = N->getValueType(0); if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1) return -1; ShuffleVectorSDNode *SVOp = cast(N); // Find the first non-undef value in the shuffle mask. unsigned i; for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i) /*search*/; if (i == 4) return -1; // all undef. // Otherwise, check to see if the rest of the elements are consecutively // numbered from this value. unsigned ShiftAmt = SVOp->getMaskElt(i); if (ShiftAmt < i) return -1; ShiftAmt -= i; // Check the rest of the elements to see if they are consecutive. for (++i; i != 4; ++i) if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) return -1; return ShiftAmt; } //===----------------------------------------------------------------------===// // Addressing Mode Selection //===----------------------------------------------------------------------===// /// isIntS16Immediate - This method tests to see if the node is either a 32-bit /// or 64-bit immediate, and if the value can be accurately represented as a /// sign extension from a 16-bit value. If so, this returns true and the /// immediate. static bool isIntS16Immediate(SDNode *N, short &Imm) { if (!isa(N)) return false; Imm = (short)cast(N)->getZExtValue(); if (N->getValueType(0) == MVT::i32) return Imm == (int32_t)cast(N)->getZExtValue(); else return Imm == (int64_t)cast(N)->getZExtValue(); } static bool isIntS16Immediate(SDValue Op, short &Imm) { return isIntS16Immediate(Op.getNode(), Imm); } /// SelectAddressRegReg - Given the specified addressed, check to see if it /// can be represented as an indexed [r+r] operation. Returns false if it /// can be more efficiently represented with [r+imm]. bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG) const { short imm = 0; if (N.getOpcode() == ISD::ADD) { if (isIntS16Immediate(N.getOperand(1), imm)) return false; // r+i if (N.getOperand(1).getOpcode() == PPCISD::Lo) return false; // r+i Base = N.getOperand(0); Index = N.getOperand(1); return true; } else if (N.getOpcode() == ISD::OR) { if (isIntS16Immediate(N.getOperand(1), imm)) return false; // r+i can fold it if we can. // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are provably // disjoint. APInt LHSKnownZero, LHSKnownOne; APInt RHSKnownZero, RHSKnownOne; DAG.computeKnownBits(N.getOperand(0), LHSKnownZero, LHSKnownOne); if (LHSKnownZero.getBoolValue()) { DAG.computeKnownBits(N.getOperand(1), RHSKnownZero, RHSKnownOne); // If all of the bits are known zero on the LHS or RHS, the add won't // carry. if (~(LHSKnownZero | RHSKnownZero) == 0) { Base = N.getOperand(0); Index = N.getOperand(1); return true; } } } return false; } // If we happen to be doing an i64 load or store into a stack slot that has // less than a 4-byte alignment, then the frame-index elimination may need to // use an indexed load or store instruction (because the offset may not be a // multiple of 4). The extra register needed to hold the offset comes from the // register scavenger, and it is possible that the scavenger will need to use // an emergency spill slot. As a result, we need to make sure that a spill slot // is allocated when doing an i64 load/store into a less-than-4-byte-aligned // stack slot. static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { // FIXME: This does not handle the LWA case. if (VT != MVT::i64) return; // NOTE: We'll exclude negative FIs here, which come from argument // lowering, because there are no known test cases triggering this problem // using packed structures (or similar). We can remove this exclusion if // we find such a test case. The reason why this is so test-case driven is // because this entire 'fixup' is only to prevent crashes (from the // register scavenger) on not-really-valid inputs. For example, if we have: // %a = alloca i1 // %b = bitcast i1* %a to i64* // store i64* a, i64 b // then the store should really be marked as 'align 1', but is not. If it // were marked as 'align 1' then the indexed form would have been // instruction-selected initially, and the problem this 'fixup' is preventing // won't happen regardless. if (FrameIdx < 0) return; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); unsigned Align = MFI->getObjectAlignment(FrameIdx); if (Align >= 4) return; PPCFunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->setHasNonRISpills(); } /// Returns true if the address N can be represented by a base register plus /// a signed 16-bit displacement [r+imm], and if it is not better /// represented as reg+reg. If Aligned is true, only accept displacements /// suitable for STD and friends, i.e. multiples of 4. bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG, bool Aligned) const { // FIXME dl should come from parent load or store, not from address SDLoc dl(N); // If this can be more profitably realized as r+r, fail. if (SelectAddressRegReg(N, Disp, Base, DAG)) return false; if (N.getOpcode() == ISD::ADD) { short imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && (!Aligned || (imm & 3) == 0)) { Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); if (FrameIndexSDNode *FI = dyn_cast(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); } else { Base = N.getOperand(0); } return true; // [r+i] } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) { // Match LOAD (ADD (X, Lo(G))). assert(!cast(N.getOperand(1).getOperand(1))->getZExtValue() && "Cannot handle constant offsets yet!"); Disp = N.getOperand(1).getOperand(0); // The global address. assert(Disp.getOpcode() == ISD::TargetGlobalAddress || Disp.getOpcode() == ISD::TargetGlobalTLSAddress || Disp.getOpcode() == ISD::TargetConstantPool || Disp.getOpcode() == ISD::TargetJumpTable); Base = N.getOperand(0); return true; // [&g+r] } } else if (N.getOpcode() == ISD::OR) { short imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && (!Aligned || (imm & 3) == 0)) { // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. APInt LHSKnownZero, LHSKnownOne; DAG.computeKnownBits(N.getOperand(0), LHSKnownZero, LHSKnownOne); if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { // If all of the bits are known zero on the LHS or RHS, the add won't // carry. if (FrameIndexSDNode *FI = dyn_cast(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); } else { Base = N.getOperand(0); } Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); return true; } } } else if (ConstantSDNode *CN = dyn_cast(N)) { // Loading from a constant address. // If this address fits entirely in a 16-bit sext immediate field, codegen // this as "d, 0" short Imm; if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) { Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); return true; } // Handle 32-bit sext immediates with LIS + addr mode. if ((CN->getValueType(0) == MVT::i32 || (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && (!Aligned || (CN->getZExtValue() & 3) == 0)) { int Addr = (int)CN->getZExtValue(); // Otherwise, break this down into an LIS + disp. Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32); Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl, MVT::i32); unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8; Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0); return true; } } Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout())); if (FrameIndexSDNode *FI = dyn_cast(N)) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); } else Base = N; return true; // [r+0] } /// SelectAddressRegRegOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG) const { // Check to see if we can easily represent this as an [r+r] address. This // will fail if it thinks that the address is more profitably represented as // reg+imm, e.g. where imm = 0. if (SelectAddressRegReg(N, Base, Index, DAG)) return true; // If the operand is an addition, always emit this as [r+r], since this is // better (for code size, and execution, as the memop does the add for free) // than emitting an explicit add. if (N.getOpcode() == ISD::ADD) { Base = N.getOperand(0); Index = N.getOperand(1); return true; } // Otherwise, do it the hard way, using R0 as the base register. Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, N.getValueType()); Index = N; return true; } /// getPreIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if the node's address /// can be legally represented as pre-indexed load / store address. bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { if (DisablePPCPreinc) return false; bool isLoad = true; SDValue Ptr; EVT VT; unsigned Alignment; if (LoadSDNode *LD = dyn_cast(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); Alignment = LD->getAlignment(); } else if (StoreSDNode *ST = dyn_cast(N)) { Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); Alignment = ST->getAlignment(); isLoad = false; } else return false; // PowerPC doesn't have preinc load/store instructions for vectors (except // for QPX, which does have preinc r+r forms). if (VT.isVector()) { if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) { return false; } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) { AM = ISD::PRE_INC; return true; } } if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) { // Common code will reject creating a pre-inc form if the base pointer // is a frame index, or if N is a store and the base pointer is either // the same as or a predecessor of the value being stored. Check for // those situations here, and try with swapped Base/Offset instead. bool Swap = false; if (isa(Base) || isa(Base)) Swap = true; else if (!isLoad) { SDValue Val = cast(N)->getValue(); if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode())) Swap = true; } if (Swap) std::swap(Base, Offset); AM = ISD::PRE_INC; return true; } // LDU/STU can only handle immediates that are a multiple of 4. if (VT != MVT::i64) { if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false)) return false; } else { // LDU/STU need an address with at least 4-byte alignment. if (Alignment < 4) return false; if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true)) return false; } if (LoadSDNode *LD = dyn_cast(N)) { // PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of // sext i32 to i64 when addr mode is r+i. if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 && LD->getExtensionType() == ISD::SEXTLOAD && isa(Offset)) return false; } AM = ISD::PRE_INC; return true; } //===----------------------------------------------------------------------===// // LowerOperation implementation //===----------------------------------------------------------------------===// /// Return true if we should reference labels using a PICBase, set the HiOpFlags /// and LoOpFlags to the target MO flags. static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget, unsigned &HiOpFlags, unsigned &LoOpFlags, const GlobalValue *GV = nullptr) { HiOpFlags = PPCII::MO_HA; LoOpFlags = PPCII::MO_LO; // Don't use the pic base if not in PIC relocation model. if (IsPIC) { HiOpFlags |= PPCII::MO_PIC_FLAG; LoOpFlags |= PPCII::MO_PIC_FLAG; } // If this is a reference to a global value that requires a non-lazy-ptr, make // sure that instruction lowering adds it. if (GV && Subtarget.hasLazyResolverStub(GV)) { HiOpFlags |= PPCII::MO_NLP_FLAG; LoOpFlags |= PPCII::MO_NLP_FLAG; if (GV->hasHiddenVisibility()) { HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG; LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG; } } } static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC, SelectionDAG &DAG) { SDLoc DL(HiPart); EVT PtrVT = HiPart.getValueType(); SDValue Zero = DAG.getConstant(0, DL, PtrVT); SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero); SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero); // With PIC, the first instruction is actually "GR+hi(&G)". if (isPIC) Hi = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi); // Generate non-pic code that has direct accesses to the constant pool. // The address of the global is just (hi(&g)+lo(&g)). return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo); } static void setUsesTOCBasePtr(MachineFunction &MF) { PPCFunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->setUsesTOCBasePtr(); } static void setUsesTOCBasePtr(SelectionDAG &DAG) { setUsesTOCBasePtr(DAG.getMachineFunction()); } static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit, SDValue GA) { EVT VT = Is64Bit ? MVT::i64 : MVT::i32; SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) : DAG.getNode(PPCISD::GlobalBaseReg, dl, VT); SDValue Ops[] = { GA, Reg }; return DAG.getMemIntrinsicNode( PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT, MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, false, true, false, 0); } SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); ConstantPoolSDNode *CP = cast(Op); const Constant *C = CP->getConstVal(); // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); return getTOCEntry(DAG, SDLoc(CP), true, GA); } unsigned MOHiFlag, MOLoFlag; bool IsPIC = isPositionIndependent(); getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); if (IsPIC && Subtarget.isSVR4ABI()) { SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), PPCII::MO_PIC_FLAG); return getTOCEntry(DAG, SDLoc(CP), false, GA); } SDValue CPIHi = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag); SDValue CPILo = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag); return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG); } SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); JumpTableSDNode *JT = cast(Op); // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); return getTOCEntry(DAG, SDLoc(JT), true, GA); } unsigned MOHiFlag, MOLoFlag; bool IsPIC = isPositionIndependent(); getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); if (IsPIC && Subtarget.isSVR4ABI()) { SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, PPCII::MO_PIC_FLAG); return getTOCEntry(DAG, SDLoc(GA), false, GA); } SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag); SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag); return LowerLabelRef(JTIHi, JTILo, IsPIC, DAG); } SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); BlockAddressSDNode *BASDN = cast(Op); const BlockAddress *BA = BASDN->getBlockAddress(); // 64-bit SVR4 ABI code is always position-independent. // The actual BlockAddress is stored in the TOC. if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()); return getTOCEntry(DAG, SDLoc(BASDN), true, GA); } unsigned MOHiFlag, MOLoFlag; bool IsPIC = isPositionIndependent(); getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag); SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag); return LowerLabelRef(TgtBAHi, TgtBALo, IsPIC, DAG); } SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // FIXME: TLS addresses currently use medium model code sequences, // which is the most useful form. Eventually support for small and // large models could be added if users need it, at the cost of // additional complexity. GlobalAddressSDNode *GA = cast(Op); if (DAG.getTarget().Options.EmulatedTLS) return LowerToTLSEmulatedModel(GA, DAG); SDLoc dl(GA); const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); bool is64bit = Subtarget.isPPC64(); const Module *M = DAG.getMachineFunction().getFunction()->getParent(); PICLevel::Level picLevel = M->getPICLevel(); TLSModel::Model Model = getTargetMachine().getTLSModel(GV); if (Model == TLSModel::LocalExec) { SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_HA); SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_LO); SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, is64bit ? MVT::i64 : MVT::i32); SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); } if (Model == TLSModel::InitialExec) { SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TLS); SDValue GOTPtr; if (is64bit) { setUsesTOCBasePtr(DAG); SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, PtrVT, GOTReg, TGA); } else GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT); SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, PtrVT, TGA, GOTPtr); return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS); } if (Model == TLSModel::GeneralDynamic) { SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); SDValue GOTPtr; if (is64bit) { setUsesTOCBasePtr(DAG); SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, GOTReg, TGA); } else { if (picLevel == PICLevel::SmallPIC) GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); else GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); } return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT, GOTPtr, TGA, TGA); } if (Model == TLSModel::LocalDynamic) { SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); SDValue GOTPtr; if (is64bit) { setUsesTOCBasePtr(DAG); SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, GOTReg, TGA); } else { if (picLevel == PICLevel::SmallPIC) GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); else GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); } SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl, PtrVT, GOTPtr, TGA, TGA); SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT, TLSAddr, TGA); return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA); } llvm_unreachable("Unknown TLS model!"); } SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); GlobalAddressSDNode *GSDN = cast(Op); SDLoc DL(GSDN); const GlobalValue *GV = GSDN->getGlobal(); // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset()); return getTOCEntry(DAG, DL, true, GA); } unsigned MOHiFlag, MOLoFlag; bool IsPIC = isPositionIndependent(); getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV); if (IsPIC && Subtarget.isSVR4ABI()) { SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), PPCII::MO_PIC_FLAG); return getTOCEntry(DAG, DL, false, GA); } SDValue GAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag); SDValue GALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag); SDValue Ptr = LowerLabelRef(GAHi, GALo, IsPIC, DAG); // If the global reference is actually to a non-lazy-pointer, we have to do an // extra load to get the address of the global. if (MOHiFlag & PPCII::MO_NLP_FLAG) Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo()); return Ptr; } SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDLoc dl(Op); if (Op.getValueType() == MVT::v2i64) { // When the operands themselves are v2i64 values, we need to do something // special because VSX has no underlying comparison operations for these. if (Op.getOperand(0).getValueType() == MVT::v2i64) { // Equality can be handled by casting to the legal type for Altivec // comparisons, everything else needs to be expanded. if (CC == ISD::SETEQ || CC == ISD::SETNE) { return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, DAG.getSetCC(dl, MVT::v4i32, DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)), DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)), CC)); } return SDValue(); } // We handle most of these in the usual way. return Op; } // If we're comparing for equality to zero, expose the fact that this is // implemented as a ctlz/srl pair on ppc, so that the dag combiner can // fold the new nodes. if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { if (C->isNullValue() && CC == ISD::SETEQ) { EVT VT = Op.getOperand(0).getValueType(); SDValue Zext = Op.getOperand(0); if (VT.bitsLT(MVT::i32)) { VT = MVT::i32; Zext = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Op.getOperand(0)); } unsigned Log2b = Log2_32(VT.getSizeInBits()); SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Zext); SDValue Scc = DAG.getNode(ISD::SRL, dl, VT, Clz, DAG.getConstant(Log2b, dl, MVT::i32)); return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Scc); } // Leave comparisons against 0 and -1 alone for now, since they're usually // optimized. FIXME: revisit this when we can custom lower all setcc // optimizations. if (C->isAllOnesValue() || C->isNullValue()) return SDValue(); } // If we have an integer seteq/setne, turn it into a compare against zero // by xor'ing the rhs with the lhs, which is faster than setting a // condition register, reading it back out, and masking the correct bit. The // normal approach here uses sub to do this instead of xor. Using xor exposes // the result to other bit-twiddling opportunities. EVT LHSVT = Op.getOperand(0).getValueType(); if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { EVT VT = Op.getValueType(); SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0), Op.getOperand(1)); return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC); } return SDValue(); } SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); EVT VT = Node->getValueType(0); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue InChain = Node->getOperand(0); SDValue VAListPtr = Node->getOperand(1); const Value *SV = cast(Node->getOperand(2))->getValue(); SDLoc dl(Node); assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only"); // gpr_index SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, VAListPtr, MachinePointerInfo(SV), MVT::i8); InChain = GprIndex.getValue(1); if (VT == MVT::i64) { // Check if GprIndex is even SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex, DAG.getConstant(1, dl, MVT::i32)); SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd, DAG.getConstant(0, dl, MVT::i32), ISD::SETNE); SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex, DAG.getConstant(1, dl, MVT::i32)); // Align GprIndex to be even if it isn't GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne, GprIndex); } // fpr index is 1 byte after gpr SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, DAG.getConstant(1, dl, MVT::i32)); // fpr SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, FprPtr, MachinePointerInfo(SV), MVT::i8); InChain = FprIndex.getValue(1); SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, DAG.getConstant(8, dl, MVT::i32)); SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, DAG.getConstant(4, dl, MVT::i32)); // areas SDValue OverflowArea = DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, MachinePointerInfo()); InChain = OverflowArea.getValue(1); SDValue RegSaveArea = DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, MachinePointerInfo()); InChain = RegSaveArea.getValue(1); // select overflow_area if index > 8 SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex, DAG.getConstant(8, dl, MVT::i32), ISD::SETLT); // adjustment constant gpr_index * 4/8 SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex, DAG.getConstant(VT.isInteger() ? 4 : 8, dl, MVT::i32)); // OurReg = RegSaveArea + RegConstant SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea, RegConstant); // Floating types are 32 bytes into RegSaveArea if (VT.isFloatingPoint()) OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg, DAG.getConstant(32, dl, MVT::i32)); // increase {f,g}pr_index by 1 (or 2 if VT is i64) SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex, DAG.getConstant(VT == MVT::i64 ? 2 : 1, dl, MVT::i32)); InChain = DAG.getTruncStore(InChain, dl, IndexPlus1, VT.isInteger() ? VAListPtr : FprPtr, MachinePointerInfo(SV), MVT::i8); // determine if we should load from reg_save_area or overflow_area SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea); // increase overflow_area by 4/8 if gpr/fpr > 8 SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea, DAG.getConstant(VT.isInteger() ? 4 : 8, dl, MVT::i32)); OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea, OverflowAreaPlusN); InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr, MachinePointerInfo(), MVT::i32); return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo()); } SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only"); // We have to copy the entire va_list struct: // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte return DAG.getMemcpy(Op.getOperand(0), Op, Op.getOperand(1), Op.getOperand(2), DAG.getConstant(12, SDLoc(Op), MVT::i32), 8, false, true, false, MachinePointerInfo(), MachinePointerInfo()); } SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { return Op.getOperand(0); } SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function SDValue Nest = Op.getOperand(3); // 'nest' parameter value SDLoc dl(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); bool isPPC64 = (PtrVT == MVT::i64); Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Ty = IntPtrTy; Entry.Node = Trmp; Args.push_back(Entry); // TrampSize == (isPPC64 ? 48 : 40); Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl, isPPC64 ? MVT::i64 : MVT::i32); Args.push_back(Entry); Entry.Node = FPtr; Args.push_back(Entry); Entry.Node = Nest; Args.push_back(Entry); // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args)); std::pair CallResult = LowerCallTo(CLI); return CallResult.second; } SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); PPCFunctionInfo *FuncInfo = MF.getInfo(); EVT PtrVT = getPointerTy(MF.getDataLayout()); SDLoc dl(Op); if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), MachinePointerInfo(SV)); } // For the 32-bit SVR4 ABI we follow the layout of the va_list struct. // We suppose the given va_list is already allocated. // // typedef struct { // char gpr; /* index into the array of 8 GPRs // * stored in the register save area // * gpr=0 corresponds to r3, // * gpr=1 to r4, etc. // */ // char fpr; /* index into the array of 8 FPRs // * stored in the register save area // * fpr=0 corresponds to f1, // * fpr=1 to f2, etc. // */ // char *overflow_arg_area; // /* location on stack that holds // * the next overflow argument // */ // char *reg_save_area; // /* where r3:r10 and f1:f8 (if saved) // * are stored // */ // } va_list[1]; SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32); SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32); SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(), PtrVT); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); uint64_t FrameOffset = PtrVT.getSizeInBits()/8; SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT); uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1; SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT); uint64_t FPROffset = 1; SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT); const Value *SV = cast(Op.getOperand(2))->getValue(); // Store first byte : number of int regs SDValue firstStore = DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, Op.getOperand(1), MachinePointerInfo(SV), MVT::i8); uint64_t nextOffset = FPROffset; SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1), ConstFPROffset); // Store second byte : number of float regs SDValue secondStore = DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr, MachinePointerInfo(SV, nextOffset), MVT::i8); nextOffset += StackOffset; nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset); // Store second word : arguments given on stack SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr, MachinePointerInfo(SV, nextOffset)); nextOffset += FrameOffset; nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset); // Store third word : arguments given in registers return DAG.getStore(thirdStore, dl, FR, nextPtr, MachinePointerInfo(SV, nextOffset)); } #include "PPCGenCallingConv.inc" // Function whose sole purpose is to kill compiler warnings // stemming from unused functions included from PPCGenCallingConv.inc. CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const { return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS; } bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { return true; } bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { static const MCPhysReg ArgRegs[] = { PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; const unsigned NumArgRegs = array_lengthof(ArgRegs); unsigned RegNum = State.getFirstUnallocated(ArgRegs); // Skip one register if the first unallocated register has an even register // number and there are still argument registers available which have not been // allocated yet. RegNum is actually an index into ArgRegs, which means we // need to skip a register if RegNum is odd. if (RegNum != NumArgRegs && RegNum % 2 == 1) { State.AllocateReg(ArgRegs[RegNum]); } // Always return false here, as this function only makes sure that the first // unallocated register has an odd register number and does not actually // allocate a register for the current argument. return false; } bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { static const MCPhysReg ArgRegs[] = { PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8 }; const unsigned NumArgRegs = array_lengthof(ArgRegs); unsigned RegNum = State.getFirstUnallocated(ArgRegs); // If there is only one Floating-point register left we need to put both f64 // values of a split ppc_fp128 value on the stack. if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) { State.AllocateReg(ArgRegs[RegNum]); } // Always return false here, as this function only makes sure that the two f64 // values a ppc_fp128 value is split into are both passed in registers or both // passed on the stack and does not actually allocate a register for the // current argument. return false; } /// FPR - The set of FP registers that should be allocated for arguments, /// on Darwin. static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13}; /// QFPR - The set of QPX registers that should be allocated for arguments. static const MCPhysReg QFPR[] = { PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13}; /// CalculateStackSlotSize - Calculates the size reserved for this argument on /// the stack. static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, unsigned PtrByteSize) { unsigned ArgSize = ArgVT.getStoreSize(); if (Flags.isByVal()) ArgSize = Flags.getByValSize(); // Round up to multiples of the pointer size, except for array members, // which are always packed. if (!Flags.isInConsecutiveRegs()) ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; return ArgSize; } /// CalculateStackSlotAlignment - Calculates the alignment of this argument /// on the stack. static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags, unsigned PtrByteSize) { unsigned Align = PtrByteSize; // Altivec parameters are padded to a 16 byte boundary. if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || ArgVT == MVT::v1i128) Align = 16; // QPX vector types stored in double-precision are padded to a 32 byte // boundary. else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1) Align = 32; // ByVal parameters are aligned as requested. if (Flags.isByVal()) { unsigned BVAlign = Flags.getByValAlign(); if (BVAlign > PtrByteSize) { if (BVAlign % PtrByteSize != 0) llvm_unreachable( "ByVal alignment is not a multiple of the pointer size"); Align = BVAlign; } } // Array members are always packed to their original alignment. if (Flags.isInConsecutiveRegs()) { // If the array member was split into multiple registers, the first // needs to be aligned to the size of the full type. (Except for // ppcf128, which is only aligned as its f64 components.) if (Flags.isSplit() && OrigVT != MVT::ppcf128) Align = OrigVT.getStoreSize(); else Align = ArgVT.getStoreSize(); } return Align; } /// CalculateStackSlotUsed - Return whether this argument will use its /// stack slot (instead of being passed in registers). ArgOffset, /// AvailableFPRs, and AvailableVRs must hold the current argument /// position, and will be updated to account for this argument. static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags, unsigned PtrByteSize, unsigned LinkageSize, unsigned ParamAreaSize, unsigned &ArgOffset, unsigned &AvailableFPRs, unsigned &AvailableVRs, bool HasQPX) { bool UseMemory = false; // Respect alignment of argument on the stack. unsigned Align = CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; // If there's no space left in the argument save area, we must // use memory (this check also catches zero-sized arguments). if (ArgOffset >= LinkageSize + ParamAreaSize) UseMemory = true; // Allocate argument on the stack. ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); if (Flags.isInConsecutiveRegsLast()) ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; // If we overran the argument save area, we must use memory // (this check catches arguments passed partially in memory) if (ArgOffset > LinkageSize + ParamAreaSize) UseMemory = true; // However, if the argument is actually passed in an FPR or a VR, // we don't use memory after all. if (!Flags.isByVal()) { if (ArgVT == MVT::f32 || ArgVT == MVT::f64 || // QPX registers overlap with the scalar FP registers. (HasQPX && (ArgVT == MVT::v4f32 || ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1))) if (AvailableFPRs > 0) { --AvailableFPRs; return false; } if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || ArgVT == MVT::v1i128) if (AvailableVRs > 0) { --AvailableVRs; return false; } } return UseMemory; } /// EnsureStackAlignment - Round stack frame size up from NumBytes to /// ensure minimum alignment required for target. static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering, unsigned NumBytes) { unsigned TargetAlign = Lowering->getStackAlignment(); unsigned AlignMask = TargetAlign - 1; NumBytes = (NumBytes + AlignMask) & ~AlignMask; return NumBytes; } SDValue PPCTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { if (Subtarget.isSVR4ABI()) { if (Subtarget.isPPC64()) return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals); else return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals); } else { return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals); } } SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { // 32-bit SVR4 ABI Stack Frame Layout: // +-----------------------------------+ // +--> | Back chain | // | +-----------------------------------+ // | | Floating-point register save area | // | +-----------------------------------+ // | | General register save area | // | +-----------------------------------+ // | | CR save word | // | +-----------------------------------+ // | | VRSAVE save word | // | +-----------------------------------+ // | | Alignment padding | // | +-----------------------------------+ // | | Vector register save area | // | +-----------------------------------+ // | | Local variable space | // | +-----------------------------------+ // | | Parameter list area | // | +-----------------------------------+ // | | LR save word | // | +-----------------------------------+ // SP--> +--- | Back chain | // +-----------------------------------+ // // Specifications: // System V Application Binary Interface PowerPC Processor Supplement // AltiVec Technology Programming Interface Manual MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo(); EVT PtrVT = getPointerTy(MF.getDataLayout()); // Potential tail calls could cause overwriting of argument stack slots. bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && (CallConv == CallingConv::Fast)); unsigned PtrByteSize = 4; // Assign locations to all of the incoming arguments. SmallVector ArgLocs; PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); // Reserve space for the linkage area on the stack. unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); CCInfo.AllocateStack(LinkageSize, PtrByteSize); if (useSoftFloat()) CCInfo.PreAnalyzeFormalArguments(Ins); CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); CCInfo.clearWasPPCF128(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; // Arguments stored in registers. if (VA.isRegLoc()) { const TargetRegisterClass *RC; EVT ValVT = VA.getValVT(); switch (ValVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("ValVT not supported by formal arguments Lowering"); case MVT::i1: case MVT::i32: RC = &PPC::GPRCRegClass; break; case MVT::f32: if (Subtarget.hasP8Vector()) RC = &PPC::VSSRCRegClass; else RC = &PPC::F4RCRegClass; break; case MVT::f64: if (Subtarget.hasVSX()) RC = &PPC::VSFRCRegClass; else RC = &PPC::F8RCRegClass; break; case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: RC = &PPC::VRRCRegClass; break; case MVT::v4f32: RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass; break; case MVT::v2f64: case MVT::v2i64: RC = &PPC::VSHRCRegClass; break; case MVT::v4f64: RC = &PPC::QFRCRegClass; break; case MVT::v4i1: RC = &PPC::QBRCRegClass; break; } // Transform the arguments stored in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, ValVT == MVT::i1 ? MVT::i32 : ValVT); if (ValVT == MVT::i1) ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue); InVals.push_back(ArgValue); } else { // Argument stored in memory. assert(VA.isMemLoc()); unsigned ArgSize = VA.getLocVT().getStoreSize(); int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(), isImmutable); // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back( DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo())); } } // Assign locations to all of the incoming aggregate by value arguments. // Aggregates passed by value are stored in the local variable space of the // caller's stack frame, right above the parameter list area. SmallVector ByValArgLocs; CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal); // Area that is at least reserved in the caller of this function. unsigned MinReservedArea = CCByValInfo.getNextStackOffset(); MinReservedArea = std::max(MinReservedArea, LinkageSize); // Set the size that is at least reserved in caller of this function. Tail // call optimized function's reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. MinReservedArea = EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); FuncInfo->setMinReservedArea(MinReservedArea); SmallVector MemOps; // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. if (isVarArg) { static const MCPhysReg GPArgRegs[] = { PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; const unsigned NumGPArgRegs = array_lengthof(GPArgRegs); static const MCPhysReg FPArgRegs[] = { PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8 }; unsigned NumFPArgRegs = array_lengthof(FPArgRegs); if (useSoftFloat()) NumFPArgRegs = 0; FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs)); FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs)); // Make room for NumGPArgRegs and NumFPArgRegs. int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 + NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8; FuncInfo->setVarArgsStackOffset( MFI->CreateFixedObject(PtrVT.getSizeInBits()/8, CCInfo.getNextStackOffset(), true)); FuncInfo->setVarArgsFrameIndex(MFI->CreateStackObject(Depth, 8, false)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); // The fixed integer arguments of a variadic function are stored to the // VarArgsFrameIndex on the stack so that they may be loaded by // dereferencing the result of va_next. for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) { // Get an existing live-in vreg, or add a new one. unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]); if (!VReg) VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); MemOps.push_back(Store); // Increment the address by four for the next argument to store SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT); FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); } // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6 // is set. // The double arguments are stored to the VarArgsFrameIndex // on the stack. for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) { // Get an existing live-in vreg, or add a new one. unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]); if (!VReg) VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); MemOps.push_back(Store); // Increment the address by eight for the next argument to store SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl, PtrVT); FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); } } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); return Chain; } // PPC64 passes i8, i16, and i32 values in i64 registers. Promote // value to MVT::i64 and then truncate to the correct register size. SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG, SDValue ArgVal, const SDLoc &dl) const { if (Flags.isSExt()) ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal, DAG.getValueType(ObjectVT)); else if (Flags.isZExt()) ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal, DAG.getValueType(ObjectVT)); return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal); } SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { // TODO: add description of PPC stack frame format, or at least some docs. // bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo(); assert(!(CallConv == CallingConv::Fast && isVarArg) && "fastcc not supported on varargs functions"); EVT PtrVT = getPointerTy(MF.getDataLayout()); // Potential tail calls could cause overwriting of argument stack slots. bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && (CallConv == CallingConv::Fast)); unsigned PtrByteSize = 8; unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; static const MCPhysReg VSRH[] = { PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8, PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13 }; const unsigned Num_GPR_Regs = array_lengthof(GPR); const unsigned Num_FPR_Regs = 13; const unsigned Num_VR_Regs = array_lengthof(VR); const unsigned Num_QFPR_Regs = Num_FPR_Regs; // Do a first pass over the arguments to determine whether the ABI // guarantees that our caller has allocated the parameter save area // on its stack frame. In the ELFv1 ABI, this is always the case; // in the ELFv2 ABI, it is true if this is a vararg function or if // any parameter is located in a stack slot. bool HasParameterArea = !isELFv2ABI || isVarArg; unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize; unsigned NumBytes = LinkageSize; unsigned AvailableFPRs = Num_FPR_Regs; unsigned AvailableVRs = Num_VR_Regs; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { if (Ins[i].Flags.isNest()) continue; if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, PtrByteSize, LinkageSize, ParamAreaSize, NumBytes, AvailableFPRs, AvailableVRs, Subtarget.hasQPX())) HasParameterArea = true; } // Add DAG nodes to load the arguments or copy them out of registers. On // entry to a function on PPC, the arguments start after the linkage area, // although the first ones are often in registers. unsigned ArgOffset = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; unsigned &QFPR_idx = FPR_idx; SmallVector MemOps; Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); unsigned CurArgIdx = 0; for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; bool needsLoad = false; EVT ObjectVT = Ins[ArgNo].VT; EVT OrigVT = Ins[ArgNo].ArgVT; unsigned ObjSize = ObjectVT.getStoreSize(); unsigned ArgSize = ObjSize; ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; if (Ins[ArgNo].isOrigArg()) { std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx); CurArgIdx = Ins[ArgNo].getOrigArgIndex(); } // We re-align the argument offset for each argument, except when using the // fast calling convention, when we need to make sure we do that only when // we'll actually use a stack slot. unsigned CurArgOffset, Align; auto ComputeArgOffset = [&]() { /* Respect alignment of argument on the stack. */ Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize); ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; CurArgOffset = ArgOffset; }; if (CallConv != CallingConv::Fast) { ComputeArgOffset(); /* Compute GPR index associated with argument offset. */ GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; GPR_idx = std::min(GPR_idx, Num_GPR_Regs); } // FIXME the codegen can be much improved in some cases. // We do not have to keep everything in memory. if (Flags.isByVal()) { assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit"); if (CallConv == CallingConv::Fast) ComputeArgOffset(); // ObjSize is the true size, ArgSize rounded up to multiple of registers. ObjSize = Flags.getByValSize(); ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; // Empty aggregate parameters do not take up registers. Examples: // struct { } a; // union { } b; // int c[0]; // etc. However, we have to provide a place-holder in InVals, so // pretend we have an 8-byte item at the current address for that // purpose. if (!ObjSize) { int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back(FIN); continue; } // Create a stack object covering all stack doublewords occupied // by the argument. If the argument is (fully or partially) on // the stack, or if the argument is fully in registers but the // caller has allocated the parameter save anyway, we can refer // directly to the caller's stack frame. Otherwise, create a // local copy in our own frame. int FI; if (HasParameterArea || ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize) FI = MFI->CreateFixedObject(ArgSize, ArgOffset, false, true); else FI = MFI->CreateStackObject(ArgSize, Align, false); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); // Handle aggregates smaller than 8 bytes. if (ObjSize < PtrByteSize) { // The value of the object is its address, which differs from the // address of the enclosing doubleword on big-endian systems. SDValue Arg = FIN; if (!isLittleEndian) { SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT); Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff); } InVals.push_back(Arg); if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store; if (ObjSize==1 || ObjSize==2 || ObjSize==4) { EVT ObjType = (ObjSize == 1 ? MVT::i8 : (ObjSize == 2 ? MVT::i16 : MVT::i32)); Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg, MachinePointerInfo(&*FuncArg), ObjType); } else { // For sizes that don't fit a truncating store (3, 5, 6, 7), // store the whole register as-is to the parameter save area // slot. Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(&*FuncArg)); } MemOps.push_back(Store); } // Whether we copied from a register or not, advance the offset // into the parameter save area by a full doubleword. ArgOffset += PtrByteSize; continue; } // The value of the object is its address, which is the address of // its first stack doubleword. InVals.push_back(FIN); // Store whatever pieces of the object are in registers to memory. for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { if (GPR_idx == Num_GPR_Regs) break; unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Addr = FIN; if (j) { SDValue Off = DAG.getConstant(j, dl, PtrVT); Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off); } SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr, MachinePointerInfo(&*FuncArg, j)); MemOps.push_back(Store); ++GPR_idx; } ArgOffset += ArgSize; continue; } switch (ObjectVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unhandled argument type!"); case MVT::i1: case MVT::i32: case MVT::i64: if (Flags.isNest()) { // The 'nest' parameter, if any, is passed in R11. unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); break; } // These can be scalar arguments or elements of an integer array type // passed directly. Clang may use those instead of "byval" aggregate // types to avoid forcing arguments to memory unnecessarily. if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) // PPC64 passes i8, i16, and i32 values in i64 registers. Promote // value to MVT::i64 and then truncate to the correct register size. ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; ArgSize = PtrByteSize; } if (CallConv != CallingConv::Fast || needsLoad) ArgOffset += 8; break; case MVT::f32: case MVT::f64: // These can be scalar arguments or elements of a float array type // passed directly. The latter are used to implement ELFv2 homogenous // float aggregates. if (FPR_idx != Num_FPR_Regs) { unsigned VReg; if (ObjectVT == MVT::f32) VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasP8Vector() ? &PPC::VSSRCRegClass : &PPC::F4RCRegClass); else VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() ? &PPC::VSFRCRegClass : &PPC::F8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++FPR_idx; } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) { // FIXME: We may want to re-enable this for CallingConv::Fast on the P8 // once we support fp <-> gpr moves. // This can only ever happen in the presence of f32 array types, // since otherwise we never run out of FPRs before running out // of GPRs. unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::f32) { if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0)) ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal, DAG.getConstant(32, dl, MVT::i32)); ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal); } ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal); } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; } // When passing an array of floats, the array occupies consecutive // space in the argument area; only round up to the next doubleword // at the end of the array. Otherwise, each float takes 8 bytes. if (CallConv != CallingConv::Fast || needsLoad) { ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize; ArgOffset += ArgSize; if (Flags.isInConsecutiveRegsLast()) ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; } break; case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: case MVT::v1i128: if (!Subtarget.hasQPX()) { // These can be scalar arguments or elements of a vector array type // passed directly. The latter are used to implement ELFv2 homogenous // vector aggregates. if (VR_idx != Num_VR_Regs) { unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ? MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) : MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++VR_idx; } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; } if (CallConv != CallingConv::Fast || needsLoad) ArgOffset += 16; break; } // not QPX assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 && "Invalid QPX parameter type"); /* fall through */ case MVT::v4f64: case MVT::v4i1: // QPX vectors are treated like their scalar floating-point subregisters // (except that they're larger). unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 16 : 32; if (QFPR_idx != Num_QFPR_Regs) { const TargetRegisterClass *RC; switch (ObjectVT.getSimpleVT().SimpleTy) { case MVT::v4f64: RC = &PPC::QFRCRegClass; break; case MVT::v4f32: RC = &PPC::QSRCRegClass; break; default: RC = &PPC::QBRCRegClass; break; } unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++QFPR_idx; } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; } if (CallConv != CallingConv::Fast || needsLoad) ArgOffset += Sz; break; } // We need to load the argument to a virtual register if we determined // above that we ran out of physical registers of the appropriate type. if (needsLoad) { if (ObjSize < ArgSize && !isLittleEndian) CurArgOffset += ArgSize - ObjSize; int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, isImmutable); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo()); } InVals.push_back(ArgVal); } // Area that is at least reserved in the caller of this function. unsigned MinReservedArea; if (HasParameterArea) MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize); else MinReservedArea = LinkageSize; // Set the size that is at least reserved in caller of this function. Tail // call optimized functions' reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. MinReservedArea = EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); FuncInfo->setMinReservedArea(MinReservedArea); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. if (isVarArg) { int Depth = ArgOffset; FuncInfo->setVarArgsFrameIndex( MFI->CreateFixedObject(PtrByteSize, Depth, true)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); // If this function is vararg, store any remaining integer argument regs // to their spots on the stack so that they may be loaded by dereferencing // the result of va_next. for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; GPR_idx < Num_GPR_Regs; ++GPR_idx) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); MemOps.push_back(Store); // Increment the address by four for the next argument to store SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT); FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); } } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); return Chain; } SDValue PPCTargetLowering::LowerFormalArguments_Darwin( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { // TODO: add description of PPC stack frame format, or at least some docs. // MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo(); EVT PtrVT = getPointerTy(MF.getDataLayout()); bool isPPC64 = PtrVT == MVT::i64; // Potential tail calls could cause overwriting of argument stack slots. bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && (CallConv == CallingConv::Fast)); unsigned PtrByteSize = isPPC64 ? 8 : 4; unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned ArgOffset = LinkageSize; // Area that is at least reserved in caller of this function. unsigned MinReservedArea = ArgOffset; static const MCPhysReg GPR_32[] = { // 32-bit registers. PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; static const MCPhysReg GPR_64[] = { // 64-bit registers. PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; const unsigned Num_GPR_Regs = array_lengthof(GPR_32); const unsigned Num_FPR_Regs = 13; const unsigned Num_VR_Regs = array_lengthof( VR); unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; // In 32-bit non-varargs functions, the stack space for vectors is after the // stack space for non-vectors. We do not use this space unless we have // too many vectors to fit in registers, something that only occurs in // constructed examples:), but we have to walk the arglist to figure // that out...for the pathological case, compute VecArgOffset as the // start of the vector parameter area. Computing VecArgOffset is the // entire point of the following loop. unsigned VecArgOffset = ArgOffset; if (!isVarArg && !isPPC64) { for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { EVT ObjectVT = Ins[ArgNo].VT; ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; if (Flags.isByVal()) { // ObjSize is the true size, ArgSize rounded up to multiple of regs. unsigned ObjSize = Flags.getByValSize(); unsigned ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; VecArgOffset += ArgSize; continue; } switch(ObjectVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unhandled argument type!"); case MVT::i1: case MVT::i32: case MVT::f32: VecArgOffset += 4; break; case MVT::i64: // PPC64 case MVT::f64: // FIXME: We are guaranteed to be !isPPC64 at this point. // Does MVT::i64 apply? VecArgOffset += 8; break; case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: // Nothing to do, we're only looking at Nonvector args here. break; } } } // We've found where the vector parameter area in memory is. Skip the // first 12 parameters; these don't use that memory. VecArgOffset = ((VecArgOffset+15)/16)*16; VecArgOffset += 12*16; // Add DAG nodes to load the arguments or copy them out of registers. On // entry to a function on PPC, the arguments start after the linkage area, // although the first ones are often in registers. SmallVector MemOps; unsigned nAltivecParamsAtEnd = 0; Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); unsigned CurArgIdx = 0; for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; bool needsLoad = false; EVT ObjectVT = Ins[ArgNo].VT; unsigned ObjSize = ObjectVT.getSizeInBits()/8; unsigned ArgSize = ObjSize; ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; if (Ins[ArgNo].isOrigArg()) { std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx); CurArgIdx = Ins[ArgNo].getOrigArgIndex(); } unsigned CurArgOffset = ArgOffset; // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary. if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 || ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) { if (isVarArg || isPPC64) { MinReservedArea = ((MinReservedArea+15)/16)*16; MinReservedArea += CalculateStackSlotSize(ObjectVT, Flags, PtrByteSize); } else nAltivecParamsAtEnd++; } else // Calculate min reserved area. MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT, Flags, PtrByteSize); // FIXME the codegen can be much improved in some cases. // We do not have to keep everything in memory. if (Flags.isByVal()) { assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit"); // ObjSize is the true size, ArgSize rounded up to multiple of registers. ObjSize = Flags.getByValSize(); ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; // Objects of size 1 and 2 are right justified, everything else is // left justified. This means the memory address is adjusted forwards. if (ObjSize==1 || ObjSize==2) { CurArgOffset = CurArgOffset + (4 - ObjSize); } // The value of the object is its address. int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, false, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back(FIN); if (ObjSize==1 || ObjSize==2) { if (GPR_idx != Num_GPR_Regs) { unsigned VReg; if (isPPC64) VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); else VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16; SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(&*FuncArg), ObjType); MemOps.push_back(Store); ++GPR_idx; } ArgOffset += PtrByteSize; continue; } for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { // Store whatever pieces of the object are in registers // to memory. ArgOffset will be the address of the beginning // of the object. if (GPR_idx != Num_GPR_Regs) { unsigned VReg; if (isPPC64) VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); else VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(&*FuncArg, j)); MemOps.push_back(Store); ++GPR_idx; ArgOffset += PtrByteSize; } else { ArgOffset += ArgSize - (ArgOffset-CurArgOffset); break; } } continue; } switch (ObjectVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unhandled argument type!"); case MVT::i1: case MVT::i32: if (!isPPC64) { if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); if (ObjectVT == MVT::i1) ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal); ++GPR_idx; } else { needsLoad = true; ArgSize = PtrByteSize; } // All int arguments reserve stack space in the Darwin ABI. ArgOffset += PtrByteSize; break; } // FALLTHROUGH case MVT::i64: // PPC64 if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) // PPC64 passes i8, i16, and i32 values in i64 registers. Promote // value to MVT::i64 and then truncate to the correct register size. ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); ++GPR_idx; } else { needsLoad = true; ArgSize = PtrByteSize; } // All int arguments reserve stack space in the Darwin ABI. ArgOffset += 8; break; case MVT::f32: case MVT::f64: // Every 4 bytes of argument space consumes one of the GPRs available for // argument passing. if (GPR_idx != Num_GPR_Regs) { ++GPR_idx; if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64) ++GPR_idx; } if (FPR_idx != Num_FPR_Regs) { unsigned VReg; if (ObjectVT == MVT::f32) VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass); else VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++FPR_idx; } else { needsLoad = true; } // All FP arguments reserve stack space in the Darwin ABI. ArgOffset += isPPC64 ? 8 : ObjSize; break; case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: // Note that vector arguments in registers don't reserve stack space, // except in varargs functions. if (VR_idx != Num_VR_Regs) { unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); if (isVarArg) { while ((ArgOffset % 16) != 0) { ArgOffset += PtrByteSize; if (GPR_idx != Num_GPR_Regs) GPR_idx++; } ArgOffset += 16; GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64? } ++VR_idx; } else { if (!isVarArg && !isPPC64) { // Vectors go after all the nonvectors. CurArgOffset = VecArgOffset; VecArgOffset += 16; } else { // Vectors are aligned. ArgOffset = ((ArgOffset+15)/16)*16; CurArgOffset = ArgOffset; ArgOffset += 16; } needsLoad = true; } break; } // We need to load the argument to a virtual register if we determined above // that we ran out of physical registers of the appropriate type. if (needsLoad) { int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset + (ArgSize - ObjSize), isImmutable); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo()); } InVals.push_back(ArgVal); } // Allow for Altivec parameters at the end, if needed. if (nAltivecParamsAtEnd) { MinReservedArea = ((MinReservedArea+15)/16)*16; MinReservedArea += 16*nAltivecParamsAtEnd; } // Area that is at least reserved in the caller of this function. MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize); // Set the size that is at least reserved in caller of this function. Tail // call optimized functions' reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. MinReservedArea = EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); FuncInfo->setMinReservedArea(MinReservedArea); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. if (isVarArg) { int Depth = ArgOffset; FuncInfo->setVarArgsFrameIndex( MFI->CreateFixedObject(PtrVT.getSizeInBits()/8, Depth, true)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); // If this function is vararg, store any remaining integer argument regs // to their spots on the stack so that they may be loaded by dereferencing // the result of va_next. for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) { unsigned VReg; if (isPPC64) VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); else VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); MemOps.push_back(Store); // Increment the address by four for the next argument to store SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT); FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); } } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); return Chain; } /// CalculateTailCallSPDiff - Get the amount the stack pointer has to be /// adjusted to accommodate the arguments for the tailcall. static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall, unsigned ParamSize) { if (!isTailCall) return 0; PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo(); unsigned CallerMinReservedArea = FI->getMinReservedArea(); int SPDiff = (int)CallerMinReservedArea - (int)ParamSize; // Remember only if the new adjustement is bigger. if (SPDiff < FI->getTailCallSPDelta()) FI->setTailCallSPDelta(SPDiff); return SPDiff; } static bool isFunctionGlobalAddress(SDValue Callee); static bool resideInSameModule(SDValue Callee, Reloc::Model RelMod) { // If !G, Callee can be an external symbol. GlobalAddressSDNode *G = dyn_cast(Callee); if (!G) return false; const GlobalValue *GV = G->getGlobal(); if (GV->isDeclaration()) return false; switch(GV->getLinkage()) { default: llvm_unreachable("unknow linkage type"); case GlobalValue::AvailableExternallyLinkage: case GlobalValue::ExternalWeakLinkage: return false; // Callee with weak linkage is allowed if it has hidden or protected // visibility case GlobalValue::LinkOnceAnyLinkage: case GlobalValue::LinkOnceODRLinkage: // e.g. c++ inline functions case GlobalValue::WeakAnyLinkage: case GlobalValue::WeakODRLinkage: // e.g. c++ template instantiation if (GV->hasDefaultVisibility()) return false; case GlobalValue::ExternalLinkage: case GlobalValue::InternalLinkage: case GlobalValue::PrivateLinkage: break; } // With '-fPIC', calling default visiblity function need insert 'nop' after // function call, no matter that function resides in same module or not, so // we treat it as in different module. if (RelMod == Reloc::PIC_ && GV->hasDefaultVisibility()) return false; return true; } static bool needStackSlotPassParameters(const PPCSubtarget &Subtarget, const SmallVectorImpl &Outs) { assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64()); const unsigned PtrByteSize = 8; const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; const unsigned NumGPRs = array_lengthof(GPR); const unsigned NumFPRs = 13; const unsigned NumVRs = array_lengthof(VR); const unsigned ParamAreaSize = NumGPRs * PtrByteSize; unsigned NumBytes = LinkageSize; unsigned AvailableFPRs = NumFPRs; unsigned AvailableVRs = NumVRs; for (const ISD::OutputArg& Param : Outs) { if (Param.Flags.isNest()) continue; if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, PtrByteSize, LinkageSize, ParamAreaSize, NumBytes, AvailableFPRs, AvailableVRs, Subtarget.hasQPX())) return true; } return false; } static bool hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) { if (CS->arg_size() != CallerFn->getArgumentList().size()) return false; ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin(); ImmutableCallSite::arg_iterator CalleeArgEnd = CS->arg_end(); Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin(); for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) { const Value* CalleeArg = *CalleeArgIter; const Value* CallerArg = &(*CallerArgIter); if (CalleeArg == CallerArg) continue; // e.g. @caller([4 x i64] %a, [4 x i64] %b) { // tail call @callee([4 x i64] undef, [4 x i64] %b) // } // 1st argument of callee is undef and has the same type as caller. if (CalleeArg->getType() == CallerArg->getType() && isa(CalleeArg)) continue; return false; } return true; } bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( SDValue Callee, CallingConv::ID CalleeCC, ImmutableCallSite *CS, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &Ins, SelectionDAG& DAG) const { bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt; if (DisableSCO && !TailCallOpt) return false; // Variadic argument functions are not supported. if (isVarArg) return false; MachineFunction &MF = DAG.getMachineFunction(); CallingConv::ID CallerCC = MF.getFunction()->getCallingConv(); // Tail or Sibling call optimization (TCO/SCO) needs callee and caller has // the same calling convention if (CallerCC != CalleeCC) return false; // SCO support C calling convention if (CalleeCC != CallingConv::Fast && CalleeCC != CallingConv::C) return false; - // Functions containing by val parameters are not supported. + // Caller contains any byval parameter is not supported. if (std::any_of(Ins.begin(), Ins.end(), [](const ISD::InputArg& IA) { return IA.Flags.isByVal(); })) + return false; + + // Callee contains any byval parameter is not supported, too. + // Note: This is a quick work around, because in some cases, e.g. + // caller's stack size > callee's stack size, we are still able to apply + // sibling call optimization. See: https://reviews.llvm.org/D23441#513574 + if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); })) return false; // No TCO/SCO on indirect call because Caller have to restore its TOC if (!isFunctionGlobalAddress(Callee) && !isa(Callee)) return false; // Check if Callee resides in the same module, because for now, PPC64 SVR4 ABI // (ELFv1/ELFv2) doesn't allow tail calls to a symbol resides in another // module. // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977 if (!resideInSameModule(Callee, getTargetMachine().getRelocationModel())) return false; // TCO allows altering callee ABI, so we don't have to check further. if (CalleeCC == CallingConv::Fast && TailCallOpt) return true; if (DisableSCO) return false; // If callee use the same argument list that caller is using, then we can // apply SCO on this case. If it is not, then we need to check if callee needs // stack for passing arguments. if (!hasSameArgumentList(MF.getFunction(), CS) && needStackSlotPassParameters(Subtarget, Outs)) { return false; } return true; } /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call /// optimization should implement this function. bool PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl &Ins, SelectionDAG& DAG) const { if (!getTargetMachine().Options.GuaranteedTailCallOpt) return false; // Variable argument functions are not supported. if (isVarArg) return false; MachineFunction &MF = DAG.getMachineFunction(); CallingConv::ID CallerCC = MF.getFunction()->getCallingConv(); if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) { // Functions containing by val parameters are not supported. for (unsigned i = 0; i != Ins.size(); i++) { ISD::ArgFlagsTy Flags = Ins[i].Flags; if (Flags.isByVal()) return false; } // Non-PIC/GOT tail calls are supported. if (getTargetMachine().getRelocationModel() != Reloc::PIC_) return true; // At the moment we can only do local tail calls (in same module, hidden // or protected) if we are generating PIC. if (GlobalAddressSDNode *G = dyn_cast(Callee)) return G->getGlobal()->hasHiddenVisibility() || G->getGlobal()->hasProtectedVisibility(); } return false; } /// isCallCompatibleAddress - Return the immediate to use if the specified /// 32-bit value is representable in the immediate field of a BxA instruction. static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) { ConstantSDNode *C = dyn_cast(Op); if (!C) return nullptr; int Addr = C->getZExtValue(); if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero. SignExtend32<26>(Addr) != Addr) return nullptr; // Top 6 bits have to be sext of immediate. return DAG .getConstant( (int)C->getZExtValue() >> 2, SDLoc(Op), DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout())) .getNode(); } namespace { struct TailCallArgumentInfo { SDValue Arg; SDValue FrameIdxOp; int FrameIdx; TailCallArgumentInfo() : FrameIdx(0) {} }; } /// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot. static void StoreTailCallArgumentsToStackSlot( SelectionDAG &DAG, SDValue Chain, const SmallVectorImpl &TailCallArgs, SmallVectorImpl &MemOpChains, const SDLoc &dl) { for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) { SDValue Arg = TailCallArgs[i].Arg; SDValue FIN = TailCallArgs[i].FrameIdxOp; int FI = TailCallArgs[i].FrameIdx; // Store relative to framepointer. MemOpChains.push_back(DAG.getStore( Chain, dl, Arg, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); } } /// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to /// the appropriate stack slot for the tail call optimized function call. static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain, SDValue OldRetAddr, SDValue OldFP, int SPDiff, const SDLoc &dl) { if (SPDiff) { // Calculate the new stack slot for the return address. MachineFunction &MF = DAG.getMachineFunction(); const PPCSubtarget &Subtarget = MF.getSubtarget(); const PPCFrameLowering *FL = Subtarget.getFrameLowering(); bool isPPC64 = Subtarget.isPPC64(); int SlotSize = isPPC64 ? 8 : 4; int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset(); int NewRetAddr = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewRetAddrLoc, true); EVT VT = isPPC64 ? MVT::i64 : MVT::i32; SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT); Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack(MF, NewRetAddr)); // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack // slot as the FP is never overwritten. if (Subtarget.isDarwinABI()) { int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset(); int NewFPIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewFPLoc, true); SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT); Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), NewFPIdx)); } } return Chain; } /// CalculateTailCallArgDest - Remember Argument for later processing. Calculate /// the position of the argument. static void CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64, SDValue Arg, int SPDiff, unsigned ArgOffset, SmallVectorImpl& TailCallArguments) { int Offset = ArgOffset + SPDiff; uint32_t OpSize = (Arg.getValueType().getSizeInBits()+7)/8; int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); EVT VT = isPPC64 ? MVT::i64 : MVT::i32; SDValue FIN = DAG.getFrameIndex(FI, VT); TailCallArgumentInfo Info; Info.Arg = Arg; Info.FrameIdxOp = FIN; Info.FrameIdx = FI; TailCallArguments.push_back(Info); } /// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address /// stack slot. Returns the chain as result and the loaded frame pointers in /// LROpOut/FPOpout. Used when tail calling. SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr( SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut, SDValue &FPOpOut, const SDLoc &dl) const { if (SPDiff) { // Load the LR and FP stack slot for later adjusting. EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; LROpOut = getReturnAddrFrameIndex(DAG); LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo()); Chain = SDValue(LROpOut.getNode(), 1); // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack // slot as the FP is never overwritten. if (Subtarget.isDarwinABI()) { FPOpOut = getFramePointerFrameIndex(DAG); FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo()); Chain = SDValue(FPOpOut.getNode(), 1); } } return Chain; } /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified /// by "Src" to address "Dst" of size "Size". Alignment information is /// specified by the specific parameter attribute. The copy will be passed as /// a byval function parameter. /// Sometimes what we are copying is the end of a larger object, the part that /// does not fit in registers. static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, const SDLoc &dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), false, false, false, MachinePointerInfo(), MachinePointerInfo()); } /// LowerMemOpCallTo - Store the argument to the stack or remember it in case of /// tail calls. static void LowerMemOpCallTo( SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg, SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64, bool isTailCall, bool isVector, SmallVectorImpl &MemOpChains, SmallVectorImpl &TailCallArguments, const SDLoc &dl) { EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); if (!isTailCall) { if (isVector) { SDValue StackPtr; if (isPPC64) StackPtr = DAG.getRegister(PPC::X1, MVT::i64); else StackPtr = DAG.getRegister(PPC::R1, MVT::i32); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, DAG.getConstant(ArgOffset, dl, PtrVT)); } MemOpChains.push_back( DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo())); // Calculate and remember argument location. } else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset, TailCallArguments); } static void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain, const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp, SDValue FPOp, SmallVectorImpl &TailCallArguments) { // Emit a sequence of copyto/copyfrom virtual registers for arguments that // might overwrite each other in case of tail call optimization. SmallVector MemOpChains2; // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments, MemOpChains2, dl); if (!MemOpChains2.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl); // Emit callseq_end just before tailcall node. Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } // Is this global address that of a function that can be called by name? (as // opposed to something that must hold a descriptor for an indirect call). static bool isFunctionGlobalAddress(SDValue Callee) { if (GlobalAddressSDNode *G = dyn_cast(Callee)) { if (Callee.getOpcode() == ISD::GlobalTLSAddress || Callee.getOpcode() == ISD::TargetGlobalTLSAddress) return false; return G->getGlobal()->getValueType()->isFunctionTy(); } return false; } static unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, SDValue CallSeqStart, const SDLoc &dl, int SPDiff, bool isTailCall, bool isPatchPoint, bool hasNest, SmallVectorImpl> &RegsToPass, SmallVectorImpl &Ops, std::vector &NodeTys, ImmutableCallSite *CS, const PPCSubtarget &Subtarget) { bool isPPC64 = Subtarget.isPPC64(); bool isSVR4ABI = Subtarget.isSVR4ABI(); bool isELFv2ABI = Subtarget.isELFv2ABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); NodeTys.push_back(MVT::Other); // Returns a chain NodeTys.push_back(MVT::Glue); // Returns a flag for retval copy to use. unsigned CallOpc = PPCISD::CALL; bool needIndirectCall = true; if (!isSVR4ABI || !isPPC64) if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) { // If this is an absolute destination address, use the munged value. Callee = SDValue(Dest, 0); needIndirectCall = false; } // PC-relative references to external symbols should go through $stub, unless // we're building with the leopard linker or later, which automatically // synthesizes these stubs. const TargetMachine &TM = DAG.getTarget(); const Module *Mod = DAG.getMachineFunction().getFunction()->getParent(); const GlobalValue *GV = nullptr; if (auto *G = dyn_cast(Callee)) GV = G->getGlobal(); bool Local = TM.shouldAssumeDSOLocal(*Mod, GV); bool UsePlt = !Local && Subtarget.isTargetELF() && !isPPC64; if (isFunctionGlobalAddress(Callee)) { GlobalAddressSDNode *G = cast(Callee); // A call to a TLS address is actually an indirect call to a // thread-specific pointer. unsigned OpFlags = 0; if (UsePlt) OpFlags = PPCII::MO_PLT; // If the callee is a GlobalAddress/ExternalSymbol node (quite common, // every direct call is) turn it into a TargetGlobalAddress / // TargetExternalSymbol node so that legalize doesn't hack it. Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, Callee.getValueType(), 0, OpFlags); needIndirectCall = false; } if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { unsigned char OpFlags = 0; if (UsePlt) OpFlags = PPCII::MO_PLT; Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(), OpFlags); needIndirectCall = false; } if (isPatchPoint) { // We'll form an invalid direct call when lowering a patchpoint; the full // sequence for an indirect call is complicated, and many of the // instructions introduced might have side effects (and, thus, can't be // removed later). The call itself will be removed as soon as the // argument/return lowering is complete, so the fact that it has the wrong // kind of operands should not really matter. needIndirectCall = false; } if (needIndirectCall) { // Otherwise, this is an indirect call. We have to use a MTCTR/BCTRL pair // to do the call, we can't use PPCISD::CALL. SDValue MTCTROps[] = {Chain, Callee, InFlag}; if (isSVR4ABI && isPPC64 && !isELFv2ABI) { // Function pointers in the 64-bit SVR4 ABI do not point to the function // entry point, but to the function descriptor (the function entry point // address is part of the function descriptor though). // The function descriptor is a three doubleword structure with the // following fields: function entry point, TOC base address and // environment pointer. // Thus for a call through a function pointer, the following actions need // to be performed: // 1. Save the TOC of the caller in the TOC save area of its stack // frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()). // 2. Load the address of the function entry point from the function // descriptor. // 3. Load the TOC of the callee from the function descriptor into r2. // 4. Load the environment pointer from the function descriptor into // r11. // 5. Branch to the function entry point address. // 6. On return of the callee, the TOC of the caller needs to be // restored (this is done in FinishCall()). // // The loads are scheduled at the beginning of the call sequence, and the // register copies are flagged together to ensure that no other // operations can be scheduled in between. E.g. without flagging the // copies together, a TOC access in the caller could be scheduled between // the assignment of the callee TOC and the branch to the callee, which // results in the TOC access going through the TOC of the callee instead // of going through the TOC of the caller, which leads to incorrect code. // Load the address of the function entry point from the function // descriptor. SDValue LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-1); if (LDChain.getValueType() == MVT::Glue) LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-2); auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors() ? MachineMemOperand::MOInvariant : MachineMemOperand::MONone; MachinePointerInfo MPI(CS ? CS->getCalledValue() : nullptr); SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI, /* Alignment = */ 8, MMOFlags); // Load environment pointer into r11. SDValue PtrOff = DAG.getIntPtrConstant(16, dl); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff); SDValue LoadEnvPtr = DAG.getLoad(MVT::i64, dl, LDChain, AddPtr, MPI.getWithOffset(16), /* Alignment = */ 8, MMOFlags); SDValue TOCOff = DAG.getIntPtrConstant(8, dl); SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff); SDValue TOCPtr = DAG.getLoad(MVT::i64, dl, LDChain, AddTOC, MPI.getWithOffset(8), /* Alignment = */ 8, MMOFlags); setUsesTOCBasePtr(DAG); SDValue TOCVal = DAG.getCopyToReg(Chain, dl, PPC::X2, TOCPtr, InFlag); Chain = TOCVal.getValue(0); InFlag = TOCVal.getValue(1); // If the function call has an explicit 'nest' parameter, it takes the // place of the environment pointer. if (!hasNest) { SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr, InFlag); Chain = EnvVal.getValue(0); InFlag = EnvVal.getValue(1); } MTCTROps[0] = Chain; MTCTROps[1] = LoadFuncPtr; MTCTROps[2] = InFlag; } Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys, makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2)); InFlag = Chain.getValue(1); NodeTys.clear(); NodeTys.push_back(MVT::Other); NodeTys.push_back(MVT::Glue); Ops.push_back(Chain); CallOpc = PPCISD::BCTRL; Callee.setNode(nullptr); // Add use of X11 (holding environment pointer) if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest) Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); // Add CTR register as callee so a bctr can be emitted later. if (isTailCall) Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT)); } // If this is a direct call, pass the chain and the callee. if (Callee.getNode()) { Ops.push_back(Chain); Ops.push_back(Callee); } // If this is a tail call add stack pointer delta. if (isTailCall) Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32)); // Add argument registers to the end of the list so that they are known live // into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live // into the call. if (isSVR4ABI && isPPC64 && !isPatchPoint) { setUsesTOCBasePtr(DAG); Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); } return CallOpc; } static bool isLocalCall(const SDValue &Callee) { if (GlobalAddressSDNode *G = dyn_cast(Callee)) return G->getGlobal()->isStrongDefinitionForLinker(); return false; } SDValue PPCTargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { SmallVector RVLocs; CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), InFlag); Chain = Val.getValue(1); InFlag = Val.getValue(2); switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::AExt: Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); break; case CCValAssign::ZExt: Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val, DAG.getValueType(VA.getValVT())); Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); break; case CCValAssign::SExt: Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val, DAG.getValueType(VA.getValVT())); Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); break; } InVals.push_back(Val); } return Chain; } SDValue PPCTargetLowering::FinishCall( CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg, bool isPatchPoint, bool hasNest, SelectionDAG &DAG, SmallVector, 8> &RegsToPass, SDValue InFlag, SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff, unsigned NumBytes, const SmallVectorImpl &Ins, SmallVectorImpl &InVals, ImmutableCallSite *CS) const { std::vector NodeTys; SmallVector Ops; unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl, SPDiff, isTailCall, isPatchPoint, hasNest, RegsToPass, Ops, NodeTys, CS, Subtarget); // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32)); // When performing tail call optimization the callee pops its arguments off // the stack. Account for this here so these bytes can be pushed back on in // PPCFrameLowering::eliminateCallFramePseudoInstr. int BytesCalleePops = (CallConv == CallingConv::Fast && getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0; // Add a register mask operand representing the call-preserved registers. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); if (InFlag.getNode()) Ops.push_back(InFlag); // Emit tail call. if (isTailCall) { assert(((Callee.getOpcode() == ISD::Register && cast(Callee)->getReg() == PPC::CTR) || Callee.getOpcode() == ISD::TargetExternalSymbol || Callee.getOpcode() == ISD::TargetGlobalAddress || isa(Callee)) && "Expecting an global address, external symbol, absolute value or register"); DAG.getMachineFunction().getFrameInfo()->setHasTailCall(); return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops); } // Add a NOP immediately after the branch instruction when using the 64-bit // SVR4 ABI. At link time, if caller and callee are in a different module and // thus have a different TOC, the call will be replaced with a call to a stub // function which saves the current TOC, loads the TOC of the callee and // branches to the callee. The NOP will be replaced with a load instruction // which restores the TOC of the caller from the TOC save slot of the current // stack frame. If caller and callee belong to the same module (and have the // same TOC), the NOP will remain unchanged. if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() && !isPatchPoint) { if (CallOpc == PPCISD::BCTRL) { // This is a call through a function pointer. // Restore the caller TOC from the save area into R2. // See PrepareCall() for more information about calls through function // pointers in the 64-bit SVR4 ABI. // We are using a target-specific load with r2 hard coded, because the // result of a target-independent load would never go directly into r2, // since r2 is a reserved register (which prevents the register allocator // from allocating it), resulting in an additional register being // allocated and an unnecessary move instruction being generated. CallOpc = PPCISD::BCTRL_LOAD_TOC; EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff); // The address needs to go after the chain input but before the flag (or // any other variadic arguments). Ops.insert(std::next(Ops.begin()), AddTOC); } else if ((CallOpc == PPCISD::CALL) && (!isLocalCall(Callee) || DAG.getTarget().getRelocationModel() == Reloc::PIC_)) // Otherwise insert NOP for non-local calls. CallOpc = PPCISD::CALL_NOP; } Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), DAG.getIntPtrConstant(BytesCalleePops, dl, true), InFlag, dl); if (!Ins.empty()) InFlag = Chain.getValue(1); return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, InVals); } SDValue PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &dl = CLI.DL; SmallVectorImpl &Outs = CLI.Outs; SmallVectorImpl &OutVals = CLI.OutVals; SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &isTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool isVarArg = CLI.IsVarArg; bool isPatchPoint = CLI.IsPatchPoint; ImmutableCallSite *CS = CLI.CS; if (isTailCall) { if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) isTailCall = IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS, isVarArg, Outs, Ins, DAG); else isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, Ins, DAG); if (isTailCall) { ++NumTailCalls; if (!getTargetMachine().Options.GuaranteedTailCallOpt) ++NumSiblingCalls; assert(isa(Callee) && "Callee should be an llvm::Function object."); DEBUG( const GlobalValue *GV = cast(Callee)->getGlobal(); const unsigned Width = 80 - strlen("TCO caller: ") - strlen(", callee linkage: 0, 0"); dbgs() << "TCO caller: " << left_justify(DAG.getMachineFunction().getName(), Width) << ", callee linkage: " << GV->getVisibility() << ", " << GV->getLinkage() << "\n" ); } } if (!isTailCall && CS && CS->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); if (Subtarget.isSVR4ABI()) { if (Subtarget.isPPC64()) return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, isTailCall, isPatchPoint, Outs, OutVals, Ins, dl, DAG, InVals, CS); else return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg, isTailCall, isPatchPoint, Outs, OutVals, Ins, dl, DAG, InVals, CS); } return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg, isTailCall, isPatchPoint, Outs, OutVals, Ins, dl, DAG, InVals, CS); } SDValue PPCTargetLowering::LowerCall_32SVR4( SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, bool isPatchPoint, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, ImmutableCallSite *CS) const { // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description // of the 32-bit SVR4 ABI stack frame layout. assert((CallConv == CallingConv::C || CallConv == CallingConv::Fast) && "Unknown calling convention!"); unsigned PtrByteSize = 4; MachineFunction &MF = DAG.getMachineFunction(); // Mark this function as potentially containing a function that contains a // tail call. As a consequence the frame pointer will be used for dynamicalloc // and restoring the callers stack pointer in this functions epilog. This is // done because by tail calling the called function might overwrite the value // in this function's (MF) stack pointer stack slot 0(SP). if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) MF.getInfo()->setHasFastCall(); // Count how many bytes are to be pushed on the stack, including the linkage // area, parameter list area and the part of the local variable space which // contains copies of aggregates which are passed by value. // Assign locations to all of the outgoing arguments. SmallVector ArgLocs; PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Reserve space for the linkage area on the stack. CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(), PtrByteSize); if (useSoftFloat()) CCInfo.PreAnalyzeCallOperands(Outs); if (isVarArg) { // Handle fixed and variable vector arguments differently. // Fixed vector arguments go into registers as long as registers are // available. Variable vector arguments always go into memory. unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; bool Result; if (Outs[i].IsFixed) { Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); } else { Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); } if (Result) { #ifndef NDEBUG errs() << "Call operand #" << i << " has unhandled type " << EVT(ArgVT).getEVTString() << "\n"; #endif llvm_unreachable(nullptr); } } } else { // All arguments are treated the same. CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4); } CCInfo.clearWasPPCF128(); // Assign locations to all of the outgoing aggregate by value arguments. SmallVector ByValArgLocs; CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal); // Size of the linkage area, parameter list area and the part of the local // space variable where copies of aggregates which are passed by value are // stored. unsigned NumBytes = CCByValInfo.getNextStackOffset(); // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be moved somewhere else // later. SDValue LROp, FPOp; Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); // Set up a copy of the stack pointer for use loading and storing any // arguments that may not fit in the registers available for argument // passing. SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32); SmallVector, 8> RegsToPass; SmallVector TailCallArguments; SmallVector MemOpChains; bool seenFloatArg = false; // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; if (Flags.isByVal()) { // Argument is an aggregate which is passed by value, thus we need to // create a copy of it in the local variable space of the current stack // frame (which is the stack frame of the caller) and pass the address of // this copy to the callee. assert((j < ByValArgLocs.size()) && "Index out of bounds!"); CCValAssign &ByValVA = ByValArgLocs[j++]; assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!"); // Memory reserved in the local variable space of the callers stack frame. unsigned LocMemOffset = ByValVA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), StackPtr, PtrOff); // Create a copy of the argument in the local area of the current // stack frame. SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff, CallSeqStart.getNode()->getOperand(0), Flags, DAG, dl); // This must go outside the CALLSEQ_START..END. SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, CallSeqStart.getNode()->getOperand(1), SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); Chain = CallSeqStart = NewCallSeqStart; // Pass the address of the aggregate copy on the stack either in a // physical register or in the parameter list area of the current stack // frame to the callee. Arg = PtrOff; } if (VA.isRegLoc()) { if (Arg.getValueType() == MVT::i1) Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Arg); seenFloatArg |= VA.getLocVT().isFloatingPoint(); // Put argument in a physical register. RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { // Put argument in the parameter list area of the current stack frame. assert(VA.isMemLoc()); unsigned LocMemOffset = VA.getLocMemOffset(); if (!isTailCall) { SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), StackPtr, PtrOff); MemOpChains.push_back( DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo())); } else { // Calculate and remember argument location. CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset, TailCallArguments); } } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } // Set CR bit 6 to true if this is a vararg call with floating args passed in // registers. if (isVarArg) { SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, InFlag }; Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1)); InFlag = Chain.getValue(1); } if (isTailCall) PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, TailCallArguments); return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, /* unused except on PPC64 ELFv1 */ false, DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins, InVals, CS); } // Copy an argument into memory, being careful to do this outside the // call sequence for the call to which the argument belongs. SDValue PPCTargetLowering::createMemcpyOutsideCallSeq( SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, const SDLoc &dl) const { SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff, CallSeqStart.getNode()->getOperand(0), Flags, DAG, dl); // The MEMCPY must go outside the CALLSEQ_START..END. SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, CallSeqStart.getNode()->getOperand(1), SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); return NewCallSeqStart; } SDValue PPCTargetLowering::LowerCall_64SVR4( SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, bool isPatchPoint, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, ImmutableCallSite *CS) const { bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); bool hasNest = false; bool IsSibCall = false; EVT PtrVT = getPointerTy(DAG.getDataLayout()); unsigned PtrByteSize = 8; MachineFunction &MF = DAG.getMachineFunction(); if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt) IsSibCall = true; // Mark this function as potentially containing a function that contains a // tail call. As a consequence the frame pointer will be used for dynamicalloc // and restoring the callers stack pointer in this functions epilog. This is // done because by tail calling the called function might overwrite the value // in this function's (MF) stack pointer stack slot 0(SP). if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) MF.getInfo()->setHasFastCall(); assert(!(CallConv == CallingConv::Fast && isVarArg) && "fastcc not supported on varargs functions"); // Count how many bytes are to be pushed on the stack, including the linkage // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage // area is 32 bytes reserved space for [SP][CR][LR][TOC]. unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned NumBytes = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; unsigned &QFPR_idx = FPR_idx; static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; static const MCPhysReg VSRH[] = { PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8, PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13 }; const unsigned NumGPRs = array_lengthof(GPR); const unsigned NumFPRs = 13; const unsigned NumVRs = array_lengthof(VR); const unsigned NumQFPRs = NumFPRs; // When using the fast calling convention, we don't provide backing for // arguments that will be in registers. unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0; // Add up all the space actually used. for (unsigned i = 0; i != NumOps; ++i) { ISD::ArgFlagsTy Flags = Outs[i].Flags; EVT ArgVT = Outs[i].VT; EVT OrigVT = Outs[i].ArgVT; if (Flags.isNest()) continue; if (CallConv == CallingConv::Fast) { if (Flags.isByVal()) NumGPRsUsed += (Flags.getByValSize()+7)/8; else switch (ArgVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unexpected ValueType for argument!"); case MVT::i1: case MVT::i32: case MVT::i64: if (++NumGPRsUsed <= NumGPRs) continue; break; case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: case MVT::v1i128: if (++NumVRsUsed <= NumVRs) continue; break; case MVT::v4f32: // When using QPX, this is handled like a FP register, otherwise, it // is an Altivec register. if (Subtarget.hasQPX()) { if (++NumFPRsUsed <= NumFPRs) continue; } else { if (++NumVRsUsed <= NumVRs) continue; } break; case MVT::f32: case MVT::f64: case MVT::v4f64: // QPX case MVT::v4i1: // QPX if (++NumFPRsUsed <= NumFPRs) continue; break; } } /* Respect alignment of argument on the stack. */ unsigned Align = CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); NumBytes = ((NumBytes + Align - 1) / Align) * Align; NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); if (Flags.isInConsecutiveRegsLast()) NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; } unsigned NumBytesActuallyUsed = NumBytes; // The prolog code of the callee may store up to 8 GPR argument registers to // the stack, allowing va_start to index over them in memory if its varargs. // Because we cannot tell if this is needed on the caller side, we have to // conservatively assume that it is needed. As such, make sure we have at // least enough stack space for the caller to store the 8 GPRs. // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area. NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); // Tail call needs the stack to be aligned. if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes); int SPDiff = 0; // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. if (!IsSibCall) SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); // To protect arguments on the stack from being clobbered in a tail call, // force all the loads to happen before doing any other lowering. if (isTailCall) Chain = DAG.getStackArgumentTokenFactor(Chain); // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else // later. SDValue LROp, FPOp; Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); // Set up a copy of the stack pointer for use loading and storing any // arguments that may not fit in the registers available for argument // passing. SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64); // Figure out which arguments are going to go in registers, and which in // memory. Also, if this is a vararg function, floating point operations // must be stored to our stack, and loaded into integer regs as well, if // any integer regs are available for argument passing. unsigned ArgOffset = LinkageSize; SmallVector, 8> RegsToPass; SmallVector TailCallArguments; SmallVector MemOpChains; for (unsigned i = 0; i != NumOps; ++i) { SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; EVT ArgVT = Outs[i].VT; EVT OrigVT = Outs[i].ArgVT; // PtrOff will be used to store the current argument to the stack if a // register cannot be found for it. SDValue PtrOff; // We re-align the argument offset for each argument, except when using the // fast calling convention, when we need to make sure we do that only when // we'll actually use a stack slot. auto ComputePtrOff = [&]() { /* Respect alignment of argument on the stack. */ unsigned Align = CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType()); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); }; if (CallConv != CallingConv::Fast) { ComputePtrOff(); /* Compute GPR index associated with argument offset. */ GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; GPR_idx = std::min(GPR_idx, NumGPRs); } // Promote integers to 64-bit values. if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) { // FIXME: Should this use ANY_EXTEND if neither sext nor zext? unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg); } // FIXME memcpy is used way more than necessary. Correctness first. // Note: "by value" is code for passing a structure by value, not // basic types. if (Flags.isByVal()) { // Note: Size includes alignment padding, so // struct x { short a; char b; } // will have Size = 4. With #pragma pack(1), it will have Size = 3. // These are the proper values we need for right-justifying the // aggregate in a parameter register. unsigned Size = Flags.getByValSize(); // An empty aggregate parameter takes up no storage and no // registers. if (Size == 0) continue; if (CallConv == CallingConv::Fast) ComputePtrOff(); // All aggregates smaller than 8 bytes must be passed right-justified. if (Size==1 || Size==2 || Size==4) { EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32); if (GPR_idx != NumGPRs) { SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, MachinePointerInfo(), VT); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); ArgOffset += PtrByteSize; continue; } } if (GPR_idx == NumGPRs && Size < 8) { SDValue AddPtr = PtrOff; if (!isLittleEndian) { SDValue Const = DAG.getConstant(PtrByteSize - Size, dl, PtrOff.getValueType()); AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); } Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, CallSeqStart, Flags, DAG, dl); ArgOffset += PtrByteSize; continue; } // Copy entire object into memory. There are cases where gcc-generated // code assumes it is there, even if it could be put entirely into // registers. (This is not what the doc says.) // FIXME: The above statement is likely due to a misunderstanding of the // documents. All arguments must be copied into the parameter area BY // THE CALLEE in the event that the callee takes the address of any // formal argument. That has not yet been implemented. However, it is // reasonable to use the stack area as a staging area for the register // load. // Skip this for small aggregates, as we will use the same slot for a // right-justified copy, below. if (Size >= 8) Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, CallSeqStart, Flags, DAG, dl); // When a register is available, pass a small aggregate right-justified. if (Size < 8 && GPR_idx != NumGPRs) { // The easiest way to get this right-justified in a register // is to copy the structure into the rightmost portion of a // local variable slot, then load the whole slot into the // register. // FIXME: The memcpy seems to produce pretty awful code for // small aggregates, particularly for packed ones. // FIXME: It would be preferable to use the slot in the // parameter save area instead of a new local variable. SDValue AddPtr = PtrOff; if (!isLittleEndian) { SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType()); AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); } Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, CallSeqStart, Flags, DAG, dl); // Load the slot into the register. SDValue Load = DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo()); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); // Done with this argument. ArgOffset += PtrByteSize; continue; } // For aggregates larger than PtrByteSize, copy the pieces of the // object that fit into registers from the parameter save area. for (unsigned j=0; j gpr moves. // In the non-vararg case, this can only ever happen in the // presence of f32 array types, since otherwise we never run // out of FPRs before running out of GPRs. SDValue ArgVal; // Double values are always passed in a single GPR. if (Arg.getValueType() != MVT::f32) { ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); // Non-array float values are extended and passed in a GPR. } else if (!Flags.isInConsecutiveRegs()) { ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); // If we have an array of floats, we collect every odd element // together with its predecessor into one GPR. } else if (ArgOffset % PtrByteSize != 0) { SDValue Lo, Hi; Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]); Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); if (!isLittleEndian) std::swap(Lo, Hi); ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); // The final element, if even, goes into the first half of a GPR. } else if (Flags.isInConsecutiveRegsLast()) { ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); if (!isLittleEndian) ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal, DAG.getConstant(32, dl, MVT::i32)); // Non-final even elements are skipped; they will be handled // together the with subsequent argument on the next go-around. } else ArgVal = SDValue(); if (ArgVal.getNode()) RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal)); } else { if (CallConv == CallingConv::Fast) ComputePtrOff(); // Single-precision floating-point values are mapped to the // second (rightmost) word of the stack doubleword. if (Arg.getValueType() == MVT::f32 && !isLittleEndian && !Flags.isInConsecutiveRegs()) { SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType()); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); } LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, false, MemOpChains, TailCallArguments, dl); NeededLoad = true; } // When passing an array of floats, the array occupies consecutive // space in the argument area; only round up to the next doubleword // at the end of the array. Otherwise, each float takes 8 bytes. if (CallConv != CallingConv::Fast || NeededLoad) { ArgOffset += (Arg.getValueType() == MVT::f32 && Flags.isInConsecutiveRegs()) ? 4 : 8; if (Flags.isInConsecutiveRegsLast()) ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; } break; } case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: case MVT::v1i128: if (!Subtarget.hasQPX()) { // These can be scalar arguments or elements of a vector array type // passed directly. The latter are used to implement ELFv2 homogenous // vector aggregates. // For a varargs call, named arguments go into VRs or on the stack as // usual; unnamed arguments always go to the stack or the corresponding // GPRs when within range. For now, we always put the value in both // locations (or even all three). if (isVarArg) { // We could elide this store in the case where the object fits // entirely in R registers. Maybe later. SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); MemOpChains.push_back(Store); if (VR_idx != NumVRs) { SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo()); MemOpChains.push_back(Load.getValue(1)); unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 || Arg.getSimpleValueType() == MVT::v2i64) ? VSRH[VR_idx] : VR[VR_idx]; ++VR_idx; RegsToPass.push_back(std::make_pair(VReg, Load)); } ArgOffset += 16; for (unsigned i=0; i<16; i+=PtrByteSize) { if (GPR_idx == NumGPRs) break; SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, DAG.getConstant(i, dl, PtrVT)); SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); } break; } // Non-varargs Altivec params go into VRs or on the stack. if (VR_idx != NumVRs) { unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 || Arg.getSimpleValueType() == MVT::v2i64) ? VSRH[VR_idx] : VR[VR_idx]; ++VR_idx; RegsToPass.push_back(std::make_pair(VReg, Arg)); } else { if (CallConv == CallingConv::Fast) ComputePtrOff(); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, true, MemOpChains, TailCallArguments, dl); if (CallConv == CallingConv::Fast) ArgOffset += 16; } if (CallConv != CallingConv::Fast) ArgOffset += 16; break; } // not QPX assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 && "Invalid QPX parameter type"); /* fall through */ case MVT::v4f64: case MVT::v4i1: { bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32; if (isVarArg) { // We could elide this store in the case where the object fits // entirely in R registers. Maybe later. SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); MemOpChains.push_back(Store); if (QFPR_idx != NumQFPRs) { SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store, PtrOff, MachinePointerInfo()); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load)); } ArgOffset += (IsF32 ? 16 : 32); for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) { if (GPR_idx == NumGPRs) break; SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, DAG.getConstant(i, dl, PtrVT)); SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); } break; } // Non-varargs QPX params go into registers or on the stack. if (QFPR_idx != NumQFPRs) { RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg)); } else { if (CallConv == CallingConv::Fast) ComputePtrOff(); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, true, MemOpChains, TailCallArguments, dl); if (CallConv == CallingConv::Fast) ArgOffset += (IsF32 ? 16 : 32); } if (CallConv != CallingConv::Fast) ArgOffset += (IsF32 ? 16 : 32); break; } } } assert(NumBytesActuallyUsed == ArgOffset); (void)NumBytesActuallyUsed; if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Check if this is an indirect call (MTCTR/BCTRL). // See PrepareCall() for more information about calls through function // pointers in the 64-bit SVR4 ABI. if (!isTailCall && !isPatchPoint && !isFunctionGlobalAddress(Callee) && !isa(Callee)) { // Load r2 into a virtual register and store it to the TOC save area. setUsesTOCBasePtr(DAG); SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64); // TOC save area offset. unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); Chain = DAG.getStore( Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset)); // In the ELFv2 ABI, R12 must contain the address of an indirect callee. // This does not mean the MTCTR instruction must use R12; it's easier // to model this as an extra parameter, so do that. if (isELFv2ABI && !isPatchPoint) RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee)); } // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } if (isTailCall && !IsSibCall) PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, TailCallArguments); return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, hasNest, DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins, InVals, CS); } SDValue PPCTargetLowering::LowerCall_Darwin( SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, bool isPatchPoint, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, ImmutableCallSite *CS) const { unsigned NumOps = Outs.size(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); bool isPPC64 = PtrVT == MVT::i64; unsigned PtrByteSize = isPPC64 ? 8 : 4; MachineFunction &MF = DAG.getMachineFunction(); // Mark this function as potentially containing a function that contains a // tail call. As a consequence the frame pointer will be used for dynamicalloc // and restoring the callers stack pointer in this functions epilog. This is // done because by tail calling the called function might overwrite the value // in this function's (MF) stack pointer stack slot 0(SP). if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) MF.getInfo()->setHasFastCall(); // Count how many bytes are to be pushed on the stack, including the linkage // area, and parameter passing area. We start with 24/48 bytes, which is // prereserved space for [SP][CR][LR][3 x unused]. unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned NumBytes = LinkageSize; // Add up all the space actually used. // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually // they all go in registers, but we must reserve stack space for them for // possible use by the caller. In varargs or 64-bit calls, parameters are // assigned stack space in order, with padding so Altivec parameters are // 16-byte aligned. unsigned nAltivecParamsAtEnd = 0; for (unsigned i = 0; i != NumOps; ++i) { ISD::ArgFlagsTy Flags = Outs[i].Flags; EVT ArgVT = Outs[i].VT; // Varargs Altivec parameters are padded to a 16 byte boundary. if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) { if (!isVarArg && !isPPC64) { // Non-varargs Altivec parameters go after all the non-Altivec // parameters; handle those later so we know how much padding we need. nAltivecParamsAtEnd++; continue; } // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary. NumBytes = ((NumBytes+15)/16)*16; } NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); } // Allow for Altivec parameters at the end, if needed. if (nAltivecParamsAtEnd) { NumBytes = ((NumBytes+15)/16)*16; NumBytes += 16*nAltivecParamsAtEnd; } // The prolog code of the callee may store up to 8 GPR argument registers to // the stack, allowing va_start to index over them in memory if its varargs. // Because we cannot tell if this is needed on the caller side, we have to // conservatively assume that it is needed. As such, make sure we have at // least enough stack space for the caller to store the 8 GPRs. NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); // Tail call needs the stack to be aligned. if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes); // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); // To protect arguments on the stack from being clobbered in a tail call, // force all the loads to happen before doing any other lowering. if (isTailCall) Chain = DAG.getStackArgumentTokenFactor(Chain); // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else // later. SDValue LROp, FPOp; Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); // Set up a copy of the stack pointer for use loading and storing any // arguments that may not fit in the registers available for argument // passing. SDValue StackPtr; if (isPPC64) StackPtr = DAG.getRegister(PPC::X1, MVT::i64); else StackPtr = DAG.getRegister(PPC::R1, MVT::i32); // Figure out which arguments are going to go in registers, and which in // memory. Also, if this is a vararg function, floating point operations // must be stored to our stack, and loaded into integer regs as well, if // any integer regs are available for argument passing. unsigned ArgOffset = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; static const MCPhysReg GPR_32[] = { // 32-bit registers. PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; static const MCPhysReg GPR_64[] = { // 64-bit registers. PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; const unsigned NumGPRs = array_lengthof(GPR_32); const unsigned NumFPRs = 13; const unsigned NumVRs = array_lengthof(VR); const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; SmallVector, 8> RegsToPass; SmallVector TailCallArguments; SmallVector MemOpChains; for (unsigned i = 0; i != NumOps; ++i) { SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; // PtrOff will be used to store the current argument to the stack if a // register cannot be found for it. SDValue PtrOff; PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType()); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); // On PPC64, promote integers to 64-bit values. if (isPPC64 && Arg.getValueType() == MVT::i32) { // FIXME: Should this use ANY_EXTEND if neither sext nor zext? unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg); } // FIXME memcpy is used way more than necessary. Correctness first. // Note: "by value" is code for passing a structure by value, not // basic types. if (Flags.isByVal()) { unsigned Size = Flags.getByValSize(); // Very small objects are passed right-justified. Everything else is // passed left-justified. if (Size==1 || Size==2) { EVT VT = (Size==1) ? MVT::i8 : MVT::i16; if (GPR_idx != NumGPRs) { SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, MachinePointerInfo(), VT); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); ArgOffset += PtrByteSize; } else { SDValue Const = DAG.getConstant(PtrByteSize - Size, dl, PtrOff.getValueType()); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, CallSeqStart, Flags, DAG, dl); ArgOffset += PtrByteSize; } continue; } // Copy entire object into memory. There are cases where gcc-generated // code assumes it is there, even if it could be put entirely into // registers. (This is not what the doc says.) Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, CallSeqStart, Flags, DAG, dl); // For small aggregates (Darwin only) and aggregates >= PtrByteSize, // copy the pieces of the object that fit into registers from the // parameter save area. for (unsigned j=0; j NumVRs) { unsigned j = 0; // Offset is aligned; skip 1st 12 params which go in V registers. ArgOffset = ((ArgOffset+15)/16)*16; ArgOffset += 12*16; for (unsigned i = 0; i != NumOps; ++i) { SDValue Arg = OutVals[i]; EVT ArgType = Outs[i].VT; if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 || ArgType==MVT::v8i16 || ArgType==MVT::v16i8) { if (++j > NumVRs) { SDValue PtrOff; // We are emitting Altivec params in order. LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, isPPC64, isTailCall, true, MemOpChains, TailCallArguments, dl); ArgOffset += 16; } } } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // On Darwin, R12 must contain the address of an indirect callee. This does // not mean the MTCTR instruction must use R12; it's easier to model this as // an extra parameter, so do that. if (!isTailCall && !isFunctionGlobalAddress(Callee) && !isa(Callee) && !isBLACompatibleAddress(Callee, DAG)) RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 : PPC::R12), Callee)); // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } if (isTailCall) PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, TailCallArguments); return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, /* unused except on PPC64 ELFv1 */ false, DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins, InVals, CS); } bool PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_PPC); } SDValue PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_PPC); SDValue Flag; SmallVector RetOps(1, Chain); // Copy the result values into the output registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue Arg = OutVals[i]; switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::AExt: Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); break; } Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { for (; *I; ++I) { if (PPC::G8RCRegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i64)); else if (PPC::F8RCRegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); else if (PPC::CRRCRegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i1)); else if (PPC::VRRCRegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::Other)); else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); } } RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps); } SDValue PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // Get the corect type for integers. EVT IntVT = Op.getValueType(); // Get the inputs. SDValue Chain = Op.getOperand(0); SDValue FPSIdx = getFramePointerFrameIndex(DAG); // Build a DYNAREAOFFSET node. SDValue Ops[2] = {Chain, FPSIdx}; SDVTList VTs = DAG.getVTList(IntVT); return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops); } SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const { // When we pop the dynamic allocation we need to restore the SP link. SDLoc dl(Op); // Get the corect type for pointers. EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Construct the stack pointer operand. bool isPPC64 = Subtarget.isPPC64(); unsigned SP = isPPC64 ? PPC::X1 : PPC::R1; SDValue StackPtr = DAG.getRegister(SP, PtrVT); // Get the operands for the STACKRESTORE. SDValue Chain = Op.getOperand(0); SDValue SaveSP = Op.getOperand(1); // Load the old link SP. SDValue LoadLinkSP = DAG.getLoad(PtrVT, dl, Chain, StackPtr, MachinePointerInfo()); // Restore the stack pointer. Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP); // Store the old link SP. return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo()); } SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); EVT PtrVT = getPointerTy(MF.getDataLayout()); // Get current frame pointer save index. The users of this index will be // primarily DYNALLOC instructions. PPCFunctionInfo *FI = MF.getInfo(); int RASI = FI->getReturnAddrSaveIndex(); // If the frame pointer save index hasn't been defined yet. if (!RASI) { // Find out what the fix offset of the frame pointer save area. int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset(); // Allocate the frame index for frame pointer save area. RASI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, LROffset, false); // Save the result. FI->setReturnAddrSaveIndex(RASI); } return DAG.getFrameIndex(RASI, PtrVT); } SDValue PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); EVT PtrVT = getPointerTy(MF.getDataLayout()); // Get current frame pointer save index. The users of this index will be // primarily DYNALLOC instructions. PPCFunctionInfo *FI = MF.getInfo(); int FPSI = FI->getFramePointerSaveIndex(); // If the frame pointer save index hasn't been defined yet. if (!FPSI) { // Find out what the fix offset of the frame pointer save area. int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset(); // Allocate the frame index for frame pointer save area. FPSI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, FPOffset, true); // Save the result. FI->setFramePointerSaveIndex(FPSI); } return DAG.getFrameIndex(FPSI, PtrVT); } SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { // Get the inputs. SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); SDLoc dl(Op); // Get the corect type for pointers. EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Negate the size. SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT, DAG.getConstant(0, dl, PtrVT), Size); // Construct a node for the frame pointer save index. SDValue FPSIdx = getFramePointerFrameIndex(DAG); // Build a DYNALLOC node. SDValue Ops[3] = { Chain, NegSize, FPSIdx }; SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other); return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops); } SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1)); } SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0), Op.getOperand(1)); } SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) return LowerVectorLoad(Op, DAG); assert(Op.getValueType() == MVT::i1 && "Custom lowering only for i1 loads"); // First, load 8 bits into 32 bits, then truncate to 1 bit. SDLoc dl(Op); LoadSDNode *LD = cast(Op); SDValue Chain = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); MachineMemOperand *MMO = LD->getMemOperand(); SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain, BasePtr, MVT::i8, MMO); SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD); SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) }; return DAG.getMergeValues(Ops, dl); } SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if (Op.getOperand(1).getValueType().isVector()) return LowerVectorStore(Op, DAG); assert(Op.getOperand(1).getValueType() == MVT::i1 && "Custom lowering only for i1 stores"); // First, zero extend to 32 bits, then use a truncating store to 8 bits. SDLoc dl(Op); StoreSDNode *ST = cast(Op); SDValue Chain = ST->getChain(); SDValue BasePtr = ST->getBasePtr(); SDValue Value = ST->getValue(); MachineMemOperand *MMO = ST->getMemOperand(); Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()), Value); return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO); } // FIXME: Remove this once the ANDI glue bug is fixed: SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType() == MVT::i1 && "Custom lowering only for i1 results"); SDLoc DL(Op); return DAG.getNode(PPCISD::ANDIo_1_GT_BIT, DL, MVT::i1, Op.getOperand(0)); } /// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when /// possible. SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // Not FP? Not a fsel. if (!Op.getOperand(0).getValueType().isFloatingPoint() || !Op.getOperand(2).getValueType().isFloatingPoint()) return Op; // We might be able to do better than this under some circumstances, but in // general, fsel-based lowering of select is a finite-math-only optimization. // For more information, see section F.3 of the 2.06 ISA specification. if (!DAG.getTarget().Options.NoInfsFPMath || !DAG.getTarget().Options.NoNaNsFPMath) return Op; // TODO: Propagate flags from the select rather than global settings. SDNodeFlags Flags; Flags.setNoInfs(true); Flags.setNoNaNs(true); ISD::CondCode CC = cast(Op.getOperand(4))->get(); EVT ResVT = Op.getValueType(); EVT CmpVT = Op.getOperand(0).getValueType(); SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); SDValue TV = Op.getOperand(2), FV = Op.getOperand(3); SDLoc dl(Op); // If the RHS of the comparison is a 0.0, we don't need to do the // subtraction at all. SDValue Sel1; if (isFloatingPointZero(RHS)) switch (CC) { default: break; // SETUO etc aren't handled by fsel. case ISD::SETNE: std::swap(TV, FV); case ISD::SETEQ: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); return DAG.getNode(PPCISD::FSEL, dl, ResVT, DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV); case ISD::SETULT: case ISD::SETLT: std::swap(TV, FV); // fsel is natively setge, swap operands for setlt case ISD::SETOGE: case ISD::SETGE: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); case ISD::SETUGT: case ISD::SETGT: std::swap(TV, FV); // fsel is natively setge, swap operands for setlt case ISD::SETOLE: case ISD::SETLE: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); return DAG.getNode(PPCISD::FSEL, dl, ResVT, DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV); } SDValue Cmp; switch (CC) { default: break; // SETUO etc aren't handled by fsel. case ISD::SETNE: std::swap(TV, FV); case ISD::SETEQ: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); return DAG.getNode(PPCISD::FSEL, dl, ResVT, DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV); case ISD::SETULT: case ISD::SETLT: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOGE: case ISD::SETGE: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); case ISD::SETUGT: case ISD::SETGT: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOLE: case ISD::SETLE: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); } return Op; } void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, SelectionDAG &DAG, const SDLoc &dl) const { assert(Op.getOperand(0).getValueType().isFloatingPoint()); SDValue Src = Op.getOperand(0); if (Src.getValueType() == MVT::f32) Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); SDValue Tmp; switch (Op.getSimpleValueType().SimpleTy) { default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); case MVT::i32: Tmp = DAG.getNode( Op.getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIWZ : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), dl, MVT::f64, Src); break; case MVT::i64: assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && "i64 FP_TO_UINT is supported only with FPCVT"); Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ, dl, MVT::f64, Src); break; } // Convert the FP value to an int value through memory. bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() && (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); int FI = cast(FIPtr)->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Emit a store to the stack slot. SDValue Chain; if (i32Stack) { MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4); SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr }; Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO); } else Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI); // Result is a load from the stack slot. If loading 4 bytes, make sure to // add in a bias on big endian. if (Op.getValueType() == MVT::i32 && !i32Stack) { FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, DAG.getConstant(4, dl, FIPtr.getValueType())); MPI = MPI.getWithOffset(Subtarget.isLittleEndian() ? 0 : 4); } RLI.Chain = Chain; RLI.Ptr = FIPtr; RLI.MPI = MPI; } /// \brief Custom lowers floating point to integer conversions to use /// the direct move instructions available in ISA 2.07 to avoid the /// need for load/store combinations. SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const { assert(Op.getOperand(0).getValueType().isFloatingPoint()); SDValue Src = Op.getOperand(0); if (Src.getValueType() == MVT::f32) Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); SDValue Tmp; switch (Op.getSimpleValueType().SimpleTy) { default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); case MVT::i32: Tmp = DAG.getNode( Op.getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIWZ : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), dl, MVT::f64, Src); Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i32, Tmp); break; case MVT::i64: assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && "i64 FP_TO_UINT is supported only with FPCVT"); Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ, dl, MVT::f64, Src); Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i64, Tmp); break; } return Tmp; } SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const { if (Subtarget.hasDirectMove() && Subtarget.isPPC64()) return LowerFP_TO_INTDirectMove(Op, DAG, dl); ReuseLoadInfo RLI; LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI, RLI.Alignment, RLI.IsInvariant ? MachineMemOperand::MOInvariant : MachineMemOperand::MONone, RLI.AAInfo, RLI.Ranges); } // We're trying to insert a regular store, S, and then a load, L. If the // incoming value, O, is a load, we might just be able to have our load use the // address used by O. However, we don't know if anything else will store to // that address before we can load from it. To prevent this situation, we need // to insert our load, L, into the chain as a peer of O. To do this, we give L // the same chain operand as O, we create a token factor from the chain results // of O and L, and we replace all uses of O's chain result with that token // factor (see spliceIntoChain below for this last part). bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI, SelectionDAG &DAG, ISD::LoadExtType ET) const { SDLoc dl(Op); if (ET == ISD::NON_EXTLOAD && (Op.getOpcode() == ISD::FP_TO_UINT || Op.getOpcode() == ISD::FP_TO_SINT) && isOperationLegalOrCustom(Op.getOpcode(), Op.getOperand(0).getValueType())) { LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); return true; } LoadSDNode *LD = dyn_cast(Op); if (!LD || LD->getExtensionType() != ET || LD->isVolatile() || LD->isNonTemporal()) return false; if (LD->getMemoryVT() != MemVT) return false; RLI.Ptr = LD->getBasePtr(); if (LD->isIndexed() && !LD->getOffset().isUndef()) { assert(LD->getAddressingMode() == ISD::PRE_INC && "Non-pre-inc AM on PPC?"); RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr, LD->getOffset()); } RLI.Chain = LD->getChain(); RLI.MPI = LD->getPointerInfo(); RLI.IsInvariant = LD->isInvariant(); RLI.Alignment = LD->getAlignment(); RLI.AAInfo = LD->getAAInfo(); RLI.Ranges = LD->getRanges(); RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1); return true; } // Given the head of the old chain, ResChain, insert a token factor containing // it and NewResChain, and make users of ResChain now be users of that token // factor. void PPCTargetLowering::spliceIntoChain(SDValue ResChain, SDValue NewResChain, SelectionDAG &DAG) const { if (!ResChain) return; SDLoc dl(NewResChain); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, NewResChain, DAG.getUNDEF(MVT::Other)); assert(TF.getNode() != NewResChain.getNode() && "A new TF really is required here"); DAG.ReplaceAllUsesOfValueWith(ResChain, TF); DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain); } /// \brief Analyze profitability of direct move /// prefer float load to int load plus direct move /// when there is no integer use of int load static bool directMoveIsProfitable(const SDValue &Op) { SDNode *Origin = Op.getOperand(0).getNode(); if (Origin->getOpcode() != ISD::LOAD) return true; for (SDNode::use_iterator UI = Origin->use_begin(), UE = Origin->use_end(); UI != UE; ++UI) { // Only look at the users of the loaded value. if (UI.getUse().get().getResNo() != 0) continue; if (UI->getOpcode() != ISD::SINT_TO_FP && UI->getOpcode() != ISD::UINT_TO_FP) return true; } return false; } /// \brief Custom lowers integer to floating point conversions to use /// the direct move instructions available in ISA 2.07 to avoid the /// need for load/store combinations. SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const { assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) && "Invalid floating point type as target of conversion"); assert(Subtarget.hasFPCVT() && "Int to FP conversions with direct moves require FPCVT"); SDValue FP; SDValue Src = Op.getOperand(0); bool SinglePrec = Op.getValueType() == MVT::f32; bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32; bool Signed = Op.getOpcode() == ISD::SINT_TO_FP; unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) : (SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU); if (WordInt) { FP = DAG.getNode(Signed ? PPCISD::MTVSRA : PPCISD::MTVSRZ, dl, MVT::f64, Src); FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); } else { FP = DAG.getNode(PPCISD::MTVSRA, dl, MVT::f64, Src); FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); } return FP; } SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) { if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64) return SDValue(); SDValue Value = Op.getOperand(0); // The values are now known to be -1 (false) or 1 (true). To convert this // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); if (Op.getValueType() != MVT::v4f64) Value = DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(), Value, DAG.getIntPtrConstant(1, dl)); return Value; } // Don't handle ppc_fp128 here; let it be lowered to a libcall. if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) return SDValue(); if (Op.getOperand(0).getValueType() == MVT::i1) return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0), DAG.getConstantFP(1.0, dl, Op.getValueType()), DAG.getConstantFP(0.0, dl, Op.getValueType())); // If we have direct moves, we can do all the conversion, skip the store/load // however, without FPCVT we can't do most conversions. if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) && Subtarget.isPPC64() && Subtarget.hasFPCVT()) return LowerINT_TO_FPDirectMove(Op, DAG, dl); assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && "UINT_TO_FP is supported only with FPCVT"); // If we have FCFIDS, then use it when converting to single-precision. // Otherwise, convert to double-precision and then round. unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS : PPCISD::FCFIDS) : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU : PPCISD::FCFID); MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? MVT::f32 : MVT::f64; if (Op.getOperand(0).getValueType() == MVT::i64) { SDValue SINT = Op.getOperand(0); // When converting to single-precision, we actually need to convert // to double-precision first and then round to single-precision. // To avoid double-rounding effects during that operation, we have // to prepare the input operand. Bits that might be truncated when // converting to double-precision are replaced by a bit that won't // be lost at this stage, but is below the single-precision rounding // position. // // However, if -enable-unsafe-fp-math is in effect, accept double // rounding to avoid the extra overhead. if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT() && !DAG.getTarget().Options.UnsafeFPMath) { // Twiddle input to make sure the low 11 bits are zero. (If this // is the case, we are guaranteed the value will fit into the 53 bit // mantissa of an IEEE double-precision value without rounding.) // If any of those low 11 bits were not zero originally, make sure // bit 12 (value 2048) is set instead, so that the final rounding // to single-precision gets the correct result. SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64, SINT, DAG.getConstant(2047, dl, MVT::i64)); Round = DAG.getNode(ISD::ADD, dl, MVT::i64, Round, DAG.getConstant(2047, dl, MVT::i64)); Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT); Round = DAG.getNode(ISD::AND, dl, MVT::i64, Round, DAG.getConstant(-2048, dl, MVT::i64)); // However, we cannot use that value unconditionally: if the magnitude // of the input value is small, the bit-twiddling we did above might // end up visibly changing the output. Fortunately, in that case, we // don't need to twiddle bits since the original input will convert // exactly to double-precision floating-point already. Therefore, // construct a conditional to use the original value if the top 11 // bits are all sign-bit copies, and use the rounded value computed // above otherwise. SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64, SINT, DAG.getConstant(53, dl, MVT::i32)); Cond = DAG.getNode(ISD::ADD, dl, MVT::i64, Cond, DAG.getConstant(1, dl, MVT::i64)); Cond = DAG.getSetCC(dl, MVT::i32, Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT); SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT); } ReuseLoadInfo RLI; SDValue Bits; MachineFunction &MF = DAG.getMachineFunction(); if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) { Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI, RLI.Alignment, RLI.IsInvariant ? MachineMemOperand::MOInvariant : MachineMemOperand::MONone, RLI.AAInfo, RLI.Ranges); spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); } else if (Subtarget.hasLFIWAX() && canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) { MachineMemOperand *MMO = MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, RLI.Alignment, RLI.AAInfo, RLI.Ranges); SDValue Ops[] = { RLI.Chain, RLI.Ptr }; Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); } else if (Subtarget.hasFPCVT() && canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) { MachineMemOperand *MMO = MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, RLI.Alignment, RLI.AAInfo, RLI.Ranges); SDValue Ops[] = { RLI.Chain, RLI.Ptr }; Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); } else if (((Subtarget.hasLFIWAX() && SINT.getOpcode() == ISD::SIGN_EXTEND) || (Subtarget.hasFPCVT() && SINT.getOpcode() == ISD::ZERO_EXTEND)) && SINT.getOperand(0).getValueType() == MVT::i32) { MachineFrameInfo *FrameInfo = MF.getFrameInfo(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FrameIdx)); assert(cast(Store)->getMemoryVT() == MVT::i32 && "Expected an i32 store"); RLI.Ptr = FIdx; RLI.Chain = Store; RLI.MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); RLI.Alignment = 4; MachineMemOperand *MMO = MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, RLI.Alignment, RLI.AAInfo, RLI.Ranges); SDValue Ops[] = { RLI.Chain, RLI.Ptr }; Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ? PPCISD::LFIWZX : PPCISD::LFIWAX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); } else Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits); if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); return FP; } assert(Op.getOperand(0).getValueType() == MVT::i32 && "Unhandled INT_TO_FP type in custom expander!"); // Since we only generate this in 64-bit mode, we can take advantage of // 64-bit registers. In particular, sign extend the input value into the // 64-bit register with extsw, store the WHOLE 64-bit value into the stack // then lfd it and fcfid it. MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *FrameInfo = MF.getFrameInfo(); EVT PtrVT = getPointerTy(MF.getDataLayout()); SDValue Ld; if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { ReuseLoadInfo RLI; bool ReusingLoad; if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI, DAG))) { int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FrameIdx)); assert(cast(Store)->getMemoryVT() == MVT::i32 && "Expected an i32 store"); RLI.Ptr = FIdx; RLI.Chain = Store; RLI.MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); RLI.Alignment = 4; } MachineMemOperand *MMO = MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, RLI.Alignment, RLI.AAInfo, RLI.Ranges); SDValue Ops[] = { RLI.Chain, RLI.Ptr }; Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::LFIWZX : PPCISD::LFIWAX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); if (ReusingLoad) spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG); } else { assert(Subtarget.isPPC64() && "i32->FP without LFIWAX supported only on PPC64"); int FrameIdx = FrameInfo->CreateStackObject(8, 8, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, Op.getOperand(0)); // STD the extended value into the stack slot. SDValue Store = DAG.getStore( DAG.getEntryNode(), dl, Ext64, FIdx, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx)); // Load the value as a double. Ld = DAG.getLoad( MVT::f64, dl, Store, FIdx, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx)); } // FCFID it and return it. SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld); if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); return FP; } SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); /* The rounding mode is in bits 30:31 of FPSR, and has the following settings: 00 Round to nearest 01 Round to 0 10 Round to +inf 11 Round to -inf FLT_ROUNDS, on the other hand, expects the following: -1 Undefined 0 Round to 0 1 Round to nearest 2 Round to +inf 3 Round to -inf To perform the conversion, we do: ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1)) */ MachineFunction &MF = DAG.getMachineFunction(); EVT VT = Op.getValueType(); EVT PtrVT = getPointerTy(MF.getDataLayout()); // Save FP Control Word to register EVT NodeTys[] = { MVT::f64, // return register MVT::Glue // unused in this context }; SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None); // Save FP register to stack slot int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, StackSlot, MachinePointerInfo()); // Load FP Control Word from low 32 bits of stack slot. SDValue Four = DAG.getConstant(4, dl, PtrVT); SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four); SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo()); // Transform as necessary SDValue CWD1 = DAG.getNode(ISD::AND, dl, MVT::i32, CWD, DAG.getConstant(3, dl, MVT::i32)); SDValue CWD2 = DAG.getNode(ISD::SRL, dl, MVT::i32, DAG.getNode(ISD::AND, dl, MVT::i32, DAG.getNode(ISD::XOR, dl, MVT::i32, CWD, DAG.getConstant(3, dl, MVT::i32)), DAG.getConstant(3, dl, MVT::i32)), DAG.getConstant(1, dl, MVT::i32)); SDValue RetVal = DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2); return DAG.getNode((VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); } SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); SDLoc dl(Op); assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && "Unexpected SHL!"); // Expand into a bunch of logical ops. Note that these ops // depend on the PPC behavior for oversized shift amounts. SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); SDValue Amt = Op.getOperand(2); EVT AmtVT = Amt.getValueType(); SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Amt); SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt); SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1); SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3); SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, DAG.getConstant(-BitWidth, dl, AmtVT)); SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5); SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt); SDValue OutOps[] = { OutLo, OutHi }; return DAG.getMergeValues(OutOps, dl); } SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); unsigned BitWidth = VT.getSizeInBits(); assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && "Unexpected SRL!"); // Expand into a bunch of logical ops. Note that these ops // depend on the PPC behavior for oversized shift amounts. SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); SDValue Amt = Op.getOperand(2); EVT AmtVT = Amt.getValueType(); SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Amt); SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt); SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, DAG.getConstant(-BitWidth, dl, AmtVT)); SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5); SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt); SDValue OutOps[] = { OutLo, OutHi }; return DAG.getMergeValues(OutOps, dl); } SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && "Unexpected SRA!"); // Expand into a bunch of logical ops, followed by a select_cc. SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); SDValue Amt = Op.getOperand(2); EVT AmtVT = Amt.getValueType(); SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Amt); SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt); SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, DAG.getConstant(-BitWidth, dl, AmtVT)); SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5); SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt); SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, dl, AmtVT), Tmp4, Tmp6, ISD::SETLE); SDValue OutOps[] = { OutLo, OutHi }; return DAG.getMergeValues(OutOps, dl); } //===----------------------------------------------------------------------===// // Vector related lowering. // /// BuildSplatI - Build a canonical splati of Val with an element size of /// SplatSize. Cast the result to VT. static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert(Val >= -16 && Val <= 15 && "vsplti is out of range!"); static const MVT VTys[] = { // canonical VT to use for each size. MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32 }; EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1]; // Force vspltis[hw] -1 to vspltisb -1 to canonicalize. if (Val == -1) SplatSize = 1; EVT CanonicalVT = VTys[SplatSize-1]; // Build a canonical splat for this value. return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT)); } /// BuildIntrinsicOp - Return a unary operator intrinsic node with the /// specified intrinsic ID. static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG, const SDLoc &dl, EVT DestVT = MVT::Other) { if (DestVT == MVT::Other) DestVT = Op.getValueType(); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, DAG.getConstant(IID, dl, MVT::i32), Op); } /// BuildIntrinsicOp - Return a binary operator intrinsic node with the /// specified intrinsic ID. static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl, EVT DestVT = MVT::Other) { if (DestVT == MVT::Other) DestVT = LHS.getValueType(); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, DAG.getConstant(IID, dl, MVT::i32), LHS, RHS); } /// BuildIntrinsicOp - Return a ternary operator intrinsic node with the /// specified intrinsic ID. static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1, SDValue Op2, SelectionDAG &DAG, const SDLoc &dl, EVT DestVT = MVT::Other) { if (DestVT == MVT::Other) DestVT = Op0.getValueType(); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2); } /// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified /// amount. The result has the specified value type. static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT, SelectionDAG &DAG, const SDLoc &dl) { // Force LHS/RHS to be the right type. LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS); RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS); int Ops[16]; for (unsigned i = 0; i != 16; ++i) Ops[i] = i + Amt; SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops); return DAG.getNode(ISD::BITCAST, dl, VT, T); } // If this is a case we can't handle, return null and let the default // expansion code take care of it. If we CAN select this case, and if it // selects to a single instruction, return Op. Otherwise, if we can codegen // this case more efficiently than a constant pool load, lower it to the // sequence of ops that should be used. SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR"); if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) { // We first build an i32 vector, load it into a QPX register, // then convert it to a floating-point vector and compare it // to a zero vector to get the boolean result. MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); assert(BVN->getNumOperands() == 4 && "BUILD_VECTOR for v4i1 does not have 4 operands"); bool IsConst = true; for (unsigned i = 0; i < 4; ++i) { if (BVN->getOperand(i).isUndef()) continue; if (!isa(BVN->getOperand(i))) { IsConst = false; break; } } if (IsConst) { Constant *One = ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0); Constant *NegOne = ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0); Constant *CV[4]; for (unsigned i = 0; i < 4; ++i) { if (BVN->getOperand(i).isUndef()) CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext())); else if (isNullConstant(BVN->getOperand(i))) CV[i] = NegOne; else CV[i] = One; } Constant *CP = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), 16 /* alignment */); SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other}); return DAG.getMemIntrinsicNode( PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } SmallVector Stores; for (unsigned i = 0; i < 4; ++i) { if (BVN->getOperand(i).isUndef()) continue; unsigned Offset = 4*i; SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize(); if (StoreSize > 4) { Stores.push_back( DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx, PtrInfo.getWithOffset(Offset), MVT::i32)); } else { SDValue StoreValue = BVN->getOperand(i); if (StoreSize < 4) StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue); Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx, PtrInfo.getWithOffset(Offset))); } } SDValue StoreChain; if (!Stores.empty()) StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); else StoreChain = DAG.getEntryNode(); // Now load from v4i32 into the QPX register; this will extend it to // v4i64 but not yet convert it to a floating point. Nevertheless, this // is typed as v4f64 because the QPX register integer states are not // explicitly represented. SDValue Ops[] = {StoreChain, DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32), FIdx}; SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other}); SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, VTs, Ops, MVT::v4i32, PtrInfo); LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32), LoadedVect); SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64); return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ); } // All other QPX vectors are handled by generic code. if (Subtarget.hasQPX()) return SDValue(); // Check if this is a splat of a constant value. APInt APSplatBits, APSplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0, !Subtarget.isLittleEndian()) || SplatBitSize > 32) return SDValue(); unsigned SplatBits = APSplatBits.getZExtValue(); unsigned SplatUndef = APSplatUndef.getZExtValue(); unsigned SplatSize = SplatBitSize / 8; // First, handle single instruction cases. // All zeros? if (SplatBits == 0) { // Canonicalize all zero vectors to be v4i32. if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) { SDValue Z = DAG.getConstant(0, dl, MVT::v4i32); Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z); } return Op; } // If the sign extended value is in the range [-16,15], use VSPLTI[bhw]. int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >> (32-SplatBitSize)); if (SextVal >= -16 && SextVal <= 15) return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl); // Two instruction sequences. // If this value is in the range [-32,30] and is even, use: // VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2) // If this value is in the range [17,31] and is odd, use: // VSPLTI[bhw](val-16) - VSPLTI[bhw](-16) // If this value is in the range [-31,-17] and is odd, use: // VSPLTI[bhw](val+16) + VSPLTI[bhw](-16) // Note the last two are three-instruction sequences. if (SextVal >= -32 && SextVal <= 31) { // To avoid having these optimizations undone by constant folding, // we convert to a pseudo that will be expanded later into one of // the above forms. SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32); EVT VT = (SplatSize == 1 ? MVT::v16i8 : (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32)); SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32); SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize); if (VT == Op.getValueType()) return RetVal; else return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal); } // If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000). This is important // for fneg/fabs. if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) { // Make -1 and vspltisw -1: SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl); // Make the VSLW intrinsic, computing 0x8000_0000. SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV, OnesV, DAG, dl); // xor by OnesV to invert it. Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } // Check to see if this is a wide variety of vsplti*, binop self cases. static const signed char SplatCsts[] = { -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16 }; for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) { // Indirect through the SplatCsts array so that we favor 'vsplti -1' for // cases which are ambiguous (e.g. formation of 0x8000_0000). 'vsplti -1' int i = SplatCsts[idx]; // Figure out what shift amount will be used by altivec if shifted by i in // this splat size. unsigned TypeShiftAmt = i & (SplatBitSize-1); // vsplti + shl self. if (SextVal == (int)((unsigned)i << TypeShiftAmt)) { SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0, Intrinsic::ppc_altivec_vslw }; Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } // vsplti + srl self. if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0, Intrinsic::ppc_altivec_vsrw }; Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } // vsplti + sra self. if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0, Intrinsic::ppc_altivec_vsraw }; Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } // vsplti + rol self. if (SextVal == (int)(((unsigned)i << TypeShiftAmt) | ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) { SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0, Intrinsic::ppc_altivec_vrlw }; Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } // t = vsplti c, result = vsldoi t, t, 1 if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1; return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } // t = vsplti c, result = vsldoi t, t, 2 if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2; return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } // t = vsplti c, result = vsldoi t, t, 3 if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3; return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } } return SDValue(); } /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit /// the specified operations to build the shuffle. static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); enum { OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> OP_VMRGHW, OP_VMRGLW, OP_VSPLTISW0, OP_VSPLTISW1, OP_VSPLTISW2, OP_VSPLTISW3, OP_VSLDOI4, OP_VSLDOI8, OP_VSLDOI12 }; if (OpNum == OP_COPY) { if (LHSID == (1*9+2)*9+3) return LHS; assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); return RHS; } SDValue OpLHS, OpRHS; OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); int ShufIdxs[16]; switch (OpNum) { default: llvm_unreachable("Unknown i32 permute!"); case OP_VMRGHW: ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3; ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19; ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7; ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23; break; case OP_VMRGLW: ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11; ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27; ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15; ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31; break; case OP_VSPLTISW0: for (unsigned i = 0; i != 16; ++i) ShufIdxs[i] = (i&3)+0; break; case OP_VSPLTISW1: for (unsigned i = 0; i != 16; ++i) ShufIdxs[i] = (i&3)+4; break; case OP_VSPLTISW2: for (unsigned i = 0; i != 16; ++i) ShufIdxs[i] = (i&3)+8; break; case OP_VSPLTISW3: for (unsigned i = 0; i != 16; ++i) ShufIdxs[i] = (i&3)+12; break; case OP_VSLDOI4: return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl); case OP_VSLDOI8: return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl); case OP_VSLDOI12: return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl); } EVT VT = OpLHS.getValueType(); OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS); OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS); SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs); return DAG.getNode(ISD::BITCAST, dl, VT, T); } /// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this /// is a shuffle we can handle in a single instruction, return it. Otherwise, /// return the code it can be lowered into. Worst case, it can always be /// lowered into a vperm. SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); ShuffleVectorSDNode *SVOp = cast(Op); EVT VT = Op.getValueType(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned ShiftElts, InsertAtByte; bool Swap; if (Subtarget.hasP9Vector() && PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap, isLittleEndian)) { if (Swap) std::swap(V1, V2); SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2); if (ShiftElts) { SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2, DAG.getConstant(ShiftElts, dl, MVT::i32)); SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Shl, DAG.getConstant(InsertAtByte, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Conv2, DAG.getConstant(InsertAtByte, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } if (Subtarget.hasVSX()) { if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv, DAG.getConstant(SplatIdx, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat); } // Left shifts of 8 bytes are actually swaps. Convert accordingly. if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) { SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap); } } if (Subtarget.hasQPX()) { if (VT.getVectorNumElements() != 4) return SDValue(); if (V2.isUndef()) V2 = V1; int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp); if (AlignIdx != -1) { return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2, DAG.getConstant(AlignIdx, dl, MVT::i32)); } else if (SVOp->isSplat()) { int SplatIdx = SVOp->getSplatIndex(); if (SplatIdx >= 4) { std::swap(V1, V2); SplatIdx -= 4; } return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1, DAG.getConstant(SplatIdx, dl, MVT::i32)); } // Lower this into a qvgpci/qvfperm pair. // Compute the qvgpci literal unsigned idx = 0; for (unsigned i = 0; i < 4; ++i) { int m = SVOp->getMaskElt(i); unsigned mm = m >= 0 ? (unsigned) m : i; idx |= mm << (3-i)*3; } SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64, DAG.getConstant(idx, dl, MVT::i32)); return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3); } // Cases that are handled by instructions that take permute immediates // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be // selected by the instruction selector. if (V2.isUndef()) { if (PPC::isSplatShuffleMask(SVOp, 1) || PPC::isSplatShuffleMask(SVOp, 2) || PPC::isSplatShuffleMask(SVOp, 4) || PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) || PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) || PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 || PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) || PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) || PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) || PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) || PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) || PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) || (Subtarget.hasP8Altivec() && ( PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) || PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) || PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) { return Op; } } // Altivec has a variety of "shuffle immediates" that take two vector inputs // and produce a fixed permutation. If any of these match, do not lower to // VPERM. unsigned int ShuffleKind = isLittleEndian ? 2 : 0; if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) || PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) || PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 || PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) || PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) || PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) || PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) || PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) || PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) || (Subtarget.hasP8Altivec() && ( PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) || PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) || PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG)))) return Op; // Check to see if this is a shuffle of 4-byte values. If so, we can use our // perfect shuffle table to emit an optimal matching sequence. ArrayRef PermMask = SVOp->getMask(); unsigned PFIndexes[4]; bool isFourElementShuffle = true; for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number unsigned EltNo = 8; // Start out undef. for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. if (PermMask[i*4+j] < 0) continue; // Undef, ignore it. unsigned ByteSource = PermMask[i*4+j]; if ((ByteSource & 3) != j) { isFourElementShuffle = false; break; } if (EltNo == 8) { EltNo = ByteSource/4; } else if (EltNo != ByteSource/4) { isFourElementShuffle = false; break; } } PFIndexes[i] = EltNo; } // If this shuffle can be expressed as a shuffle of 4-byte elements, use the // perfect shuffle vector to determine if it is cost effective to do this as // discrete instructions, or whether we should use a vperm. // For now, we skip this for little endian until such time as we have a // little-endian perfect shuffle table. if (isFourElementShuffle && !isLittleEndian) { // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); // Determining when to avoid vperm is tricky. Many things affect the cost // of vperm, particularly how many times the perm mask needs to be computed. // For example, if the perm mask can be hoisted out of a loop or is already // used (perhaps because there are multiple permutes with the same shuffle // mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of // the loop requires an extra register. // // As a compromise, we only emit discrete instructions if the shuffle can be // generated in 3 or fewer operations. When we have loop information // available, if this block is within a loop, we should avoid using vperm // for 3-operation perms and use a constant pool load instead. if (Cost < 3) return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant // vector that will get spilled to the constant pool. if (V2.isUndef()) V2 = V1; // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except // that it is in input element units, not in bytes. Convert now. // For little endian, the order of the input vectors is reversed, and // the permutation mask is complemented with respect to 31. This is // necessary to produce proper semantics with the big-endian-biased vperm // instruction. EVT EltVT = V1.getValueType().getVectorElementType(); unsigned BytesPerElement = EltVT.getSizeInBits()/8; SmallVector ResultMask; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i]; for (unsigned j = 0; j != BytesPerElement; ++j) if (isLittleEndian) ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement + j), dl, MVT::i32)); else ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement + j, dl, MVT::i32)); } SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask); if (isLittleEndian) return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V2, V1, VPermMask); else return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V1, V2, VPermMask); } /// getVectorCompareInfo - Given an intrinsic, return false if it is not a /// vector comparison. If it is, return true and fill in Opc/isDot with /// information about the intrinsic. static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, bool &isDot, const PPCSubtarget &Subtarget) { unsigned IntrinsicID = cast(Intrin.getOperand(0))->getZExtValue(); CompareOpc = -1; isDot = false; switch (IntrinsicID) { default: return false; // Comparison predicates. case Intrinsic::ppc_altivec_vcmpbfp_p: CompareOpc = 966; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpequd_p: if (Subtarget.hasP8Altivec()) { CompareOpc = 199; isDot = 1; } else return false; break; case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtsd_p: if (Subtarget.hasP8Altivec()) { CompareOpc = 967; isDot = 1; } else return false; break; case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtud_p: if (Subtarget.hasP8Altivec()) { CompareOpc = 711; isDot = 1; } else return false; break; // VSX predicate comparisons use the same infrastructure case Intrinsic::ppc_vsx_xvcmpeqdp_p: case Intrinsic::ppc_vsx_xvcmpgedp_p: case Intrinsic::ppc_vsx_xvcmpgtdp_p: case Intrinsic::ppc_vsx_xvcmpeqsp_p: case Intrinsic::ppc_vsx_xvcmpgesp_p: case Intrinsic::ppc_vsx_xvcmpgtsp_p: if (Subtarget.hasVSX()) { switch (IntrinsicID) { case Intrinsic::ppc_vsx_xvcmpeqdp_p: CompareOpc = 99; break; case Intrinsic::ppc_vsx_xvcmpgedp_p: CompareOpc = 115; break; case Intrinsic::ppc_vsx_xvcmpgtdp_p: CompareOpc = 107; break; case Intrinsic::ppc_vsx_xvcmpeqsp_p: CompareOpc = 67; break; case Intrinsic::ppc_vsx_xvcmpgesp_p: CompareOpc = 83; break; case Intrinsic::ppc_vsx_xvcmpgtsp_p: CompareOpc = 75; break; } isDot = 1; } else return false; break; // Normal Comparisons. case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpequb: CompareOpc = 6; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpequh: CompareOpc = 70; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpequd: if (Subtarget.hasP8Altivec()) { CompareOpc = 199; isDot = 0; } else return false; break; case Intrinsic::ppc_altivec_vcmpgefp: CompareOpc = 454; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtfp: CompareOpc = 710; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtsd: if (Subtarget.hasP8Altivec()) { CompareOpc = 967; isDot = 0; } else return false; break; case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtud: if (Subtarget.hasP8Altivec()) { CompareOpc = 711; isDot = 0; } else return false; break; } return true; } /// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom /// lower, do it, otherwise return null. SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); if (IntrinsicID == Intrinsic::thread_pointer) { // Reads the thread pointer register, used for __builtin_thread_pointer. bool is64bit = Subtarget.isPPC64(); return DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, is64bit ? MVT::i64 : MVT::i32); } // If this is a lowered altivec predicate compare, CompareOpc is set to the // opcode number of the comparison. SDLoc dl(Op); int CompareOpc; bool isDot; if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget)) return SDValue(); // Don't custom lower most intrinsics. // If this is a non-dot comparison, make the VCMP node and we are done. if (!isDot) { SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(), Op.getOperand(1), Op.getOperand(2), DAG.getConstant(CompareOpc, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp); } // Create the PPCISD altivec 'dot' comparison node. SDValue Ops[] = { Op.getOperand(2), // LHS Op.getOperand(3), // RHS DAG.getConstant(CompareOpc, dl, MVT::i32) }; EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue }; SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); // Now that we have the comparison, emit a copy from the CR to a GPR. // This is flagged to the above dot comparison. SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32, DAG.getRegister(PPC::CR6, MVT::i32), CompNode.getValue(1)); // Unpack the result based on how the target uses it. unsigned BitNo; // Bit # of CR6. bool InvertBit; // Invert result? switch (cast(Op.getOperand(1))->getZExtValue()) { default: // Can't happen, don't crash on invalid number though. case 0: // Return the value of the EQ bit of CR6. BitNo = 0; InvertBit = false; break; case 1: // Return the inverted value of the EQ bit of CR6. BitNo = 0; InvertBit = true; break; case 2: // Return the value of the LT bit of CR6. BitNo = 2; InvertBit = false; break; case 3: // Return the inverted value of the LT bit of CR6. BitNo = 2; InvertBit = true; break; } // Shift the bit into the low position. Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags, DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32)); // Isolate the bit. Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags, DAG.getConstant(1, dl, MVT::i32)); // If we are supposed to, toggle the bit. if (InvertBit) Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags, DAG.getConstant(1, dl, MVT::i32)); return Flags; } SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // For v2i64 (VSX), we can pattern patch the v2i32 case (using fp <-> int // instructions), but for smaller types, we need to first extend up to v2i32 // before doing going farther. if (Op.getValueType() == MVT::v2i64) { EVT ExtVT = cast(Op.getOperand(1))->getVT(); if (ExtVT != MVT::v2i32) { Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)); Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Op, DAG.getValueType(EVT::getVectorVT(*DAG.getContext(), ExtVT.getVectorElementType(), 4))); Op = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Op); Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v2i64, Op, DAG.getValueType(MVT::v2i32)); } return Op; } return SDValue(); } SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // Create a stack slot that is 16-byte aligned. MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); // Store the input value into Value#0 of the stack slot. SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, MachinePointerInfo()); // Load it out. return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo()); } SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDNode *N = Op.getNode(); assert(N->getOperand(0).getValueType() == MVT::v4i1 && "Unknown extract_vector_elt type"); SDValue Value = N->getOperand(0); // The first part of this is like the store lowering except that we don't // need to track the chain. // The values are now known to be -1 (false) or 1 (true). To convert this // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to // understand how to form the extending load. SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); // Now convert to an integer and store. Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), Value); MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue StoreChain = DAG.getEntryNode(); SDValue Ops[] = {StoreChain, DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), Value, FIdx}; SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, VTs, Ops, MVT::v4i32, PtrInfo); // Extract the value requested. unsigned Offset = 4*cast(N->getOperand(1))->getZExtValue(); SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); SDValue IntVal = DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset)); if (!Subtarget.useCRBits()) return IntVal; return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal); } /// Lowering for QPX v4i1 loads SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); LoadSDNode *LN = cast(Op.getNode()); SDValue LoadChain = LN->getChain(); SDValue BasePtr = LN->getBasePtr(); if (Op.getValueType() == MVT::v4f64 || Op.getValueType() == MVT::v4f32) { EVT MemVT = LN->getMemoryVT(); unsigned Alignment = LN->getAlignment(); // If this load is properly aligned, then it is legal. if (Alignment >= MemVT.getStoreSize()) return Op; EVT ScalarVT = Op.getValueType().getScalarType(), ScalarMemVT = MemVT.getScalarType(); unsigned Stride = ScalarMemVT.getStoreSize(); SDValue Vals[4], LoadChains[4]; for (unsigned Idx = 0; Idx < 4; ++Idx) { SDValue Load; if (ScalarVT != ScalarMemVT) Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain, BasePtr, LN->getPointerInfo().getWithOffset(Idx * Stride), ScalarMemVT, MinAlign(Alignment, Idx * Stride), LN->getMemOperand()->getFlags(), LN->getAAInfo()); else Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr, LN->getPointerInfo().getWithOffset(Idx * Stride), MinAlign(Alignment, Idx * Stride), LN->getMemOperand()->getFlags(), LN->getAAInfo()); if (Idx == 0 && LN->isIndexed()) { assert(LN->getAddressingMode() == ISD::PRE_INC && "Unknown addressing mode on vector load"); Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(), LN->getAddressingMode()); } Vals[Idx] = Load; LoadChains[Idx] = Load.getValue(1); BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getConstant(Stride, dl, BasePtr.getValueType())); } SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals); if (LN->isIndexed()) { SDValue RetOps[] = { Value, Vals[0].getValue(1), TF }; return DAG.getMergeValues(RetOps, dl); } SDValue RetOps[] = { Value, TF }; return DAG.getMergeValues(RetOps, dl); } assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower"); assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported"); // To lower v4i1 from a byte array, we load the byte elements of the // vector and then reuse the BUILD_VECTOR logic. SDValue VectElmts[4], VectElmtChains[4]; for (unsigned i = 0; i < 4; ++i) { SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); VectElmts[i] = DAG.getExtLoad( ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx, LN->getPointerInfo().getWithOffset(i), MVT::i8, /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo()); VectElmtChains[i] = VectElmts[i].getValue(1); } LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains); SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts); SDValue RVals[] = { Value, LoadChain }; return DAG.getMergeValues(RVals, dl); } /// Lowering for QPX v4i1 stores SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); StoreSDNode *SN = cast(Op.getNode()); SDValue StoreChain = SN->getChain(); SDValue BasePtr = SN->getBasePtr(); SDValue Value = SN->getValue(); if (Value.getValueType() == MVT::v4f64 || Value.getValueType() == MVT::v4f32) { EVT MemVT = SN->getMemoryVT(); unsigned Alignment = SN->getAlignment(); // If this store is properly aligned, then it is legal. if (Alignment >= MemVT.getStoreSize()) return Op; EVT ScalarVT = Value.getValueType().getScalarType(), ScalarMemVT = MemVT.getScalarType(); unsigned Stride = ScalarMemVT.getStoreSize(); SDValue Stores[4]; for (unsigned Idx = 0; Idx < 4; ++Idx) { SDValue Ex = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout()))); SDValue Store; if (ScalarVT != ScalarMemVT) Store = DAG.getTruncStore(StoreChain, dl, Ex, BasePtr, SN->getPointerInfo().getWithOffset(Idx * Stride), ScalarMemVT, MinAlign(Alignment, Idx * Stride), SN->getMemOperand()->getFlags(), SN->getAAInfo()); else Store = DAG.getStore(StoreChain, dl, Ex, BasePtr, SN->getPointerInfo().getWithOffset(Idx * Stride), MinAlign(Alignment, Idx * Stride), SN->getMemOperand()->getFlags(), SN->getAAInfo()); if (Idx == 0 && SN->isIndexed()) { assert(SN->getAddressingMode() == ISD::PRE_INC && "Unknown addressing mode on vector store"); Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(), SN->getAddressingMode()); } BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getConstant(Stride, dl, BasePtr.getValueType())); Stores[Idx] = Store; } SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); if (SN->isIndexed()) { SDValue RetOps[] = { TF, Stores[0].getValue(1) }; return DAG.getMergeValues(RetOps, dl); } return TF; } assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported"); assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower"); // The values are now known to be -1 (false) or 1 (true). To convert this // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to // understand how to form the extending load. SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); // Now convert to an integer and store. Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), Value); MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue Ops[] = {StoreChain, DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), Value, FIdx}; SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, VTs, Ops, MVT::v4i32, PtrInfo); // Move data into the byte array. SDValue Loads[4], LoadChains[4]; for (unsigned i = 0; i < 4; ++i) { unsigned Offset = 4*i; SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset)); LoadChains[i] = Loads[i].getValue(1); } StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); SDValue Stores[4]; for (unsigned i = 0; i < 4; ++i) { SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); Stores[i] = DAG.getTruncStore( StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i), MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(), SN->getAAInfo()); } StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); return StoreChain; } SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); if (Op.getValueType() == MVT::v4i32) { SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); SDValue Zero = BuildSplatI( 0, 1, MVT::v4i32, DAG, dl); SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt. SDValue RHSSwap = // = vrlw RHS, 16 BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl); // Shrinkify inputs to v8i16. LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS); RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS); RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap); // Low parts multiplied together, generating 32-bit results (we ignore the // top parts). SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh, LHS, RHS, DAG, dl, MVT::v4i32); SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm, LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32); // Shift the high parts up 16 bits. HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd, Neg16, DAG, dl); return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd); } else if (Op.getValueType() == MVT::v8i16) { SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl); return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm, LHS, RHS, Zero, DAG, dl); } else if (Op.getValueType() == MVT::v16i8) { SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); bool isLittleEndian = Subtarget.isLittleEndian(); // Multiply the even 8-bit parts, producing 16-bit sums. SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub, LHS, RHS, DAG, dl, MVT::v8i16); EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts); // Multiply the odd 8-bit parts, producing 16-bit sums. SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub, LHS, RHS, DAG, dl, MVT::v8i16); OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts); // Merge the results together. Because vmuleub and vmuloub are // instructions with a big-endian bias, we must reverse the // element numbering and reverse the meaning of "odd" and "even" // when generating little endian code. int Ops[16]; for (unsigned i = 0; i != 8; ++i) { if (isLittleEndian) { Ops[i*2 ] = 2*i; Ops[i*2+1] = 2*i+16; } else { Ops[i*2 ] = 2*i+1; Ops[i*2+1] = 2*i+1+16; } } if (isLittleEndian) return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops); else return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops); } else { llvm_unreachable("Unknown mul to lower!"); } } /// LowerOperation - Provide custom lowering hooks for some operations. /// SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Wasn't expecting to be able to lower this!"); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, DAG); case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::GET_DYNAMIC_AREA_OFFSET: return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, SDLoc(Op)); case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); // Lower 64-bit shifts. case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG); case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG); case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG); // Vector-related lowering. case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); // For counter-based loop handling. case ISD::INTRINSIC_W_CHAIN: return SDValue(); // Frame & Return address. case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); } } void PPCTargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, SelectionDAG &DAG) const { SDLoc dl(N); switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); case ISD::READCYCLECOUNTER: { SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0)); Results.push_back(RTB); Results.push_back(RTB.getValue(1)); Results.push_back(RTB.getValue(2)); break; } case ISD::INTRINSIC_W_CHAIN: { if (cast(N->getOperand(1))->getZExtValue() != Intrinsic::ppc_is_decremented_ctr_nonzero) break; assert(N->getValueType(0) == MVT::i1 && "Unexpected result type for CTR decrement intrinsic"); EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), N->getValueType(0)); SDVTList VTs = DAG.getVTList(SVT, MVT::Other); SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0), N->getOperand(1)); Results.push_back(NewInt); Results.push_back(NewInt.getValue(1)); break; } case ISD::VAARG: { if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64()) return; EVT VT = N->getValueType(0); if (VT == MVT::i64) { SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG); Results.push_back(NewNode); Results.push_back(NewNode.getValue(1)); } return; } case ISD::FP_ROUND_INREG: { assert(N->getValueType(0) == MVT::ppcf128); assert(N->getOperand(0).getValueType() == MVT::ppcf128); SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, N->getOperand(0), DAG.getIntPtrConstant(0, dl)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, N->getOperand(0), DAG.getIntPtrConstant(1, dl)); // Add the two halves of the long double in round-to-zero mode. SDValue FPreg = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi); // We know the low half is about to be thrown away, so just use something // convenient. Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128, FPreg, FPreg)); return; } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: // LowerFP_TO_INT() can only handle f32 and f64. if (N->getOperand(0).getValueType() == MVT::ppcf128) return; Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl)); return; } } //===----------------------------------------------------------------------===// // Other Lowering Code //===----------------------------------------------------------------------===// static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Function *Func = Intrinsic::getDeclaration(M, Id); return Builder.CreateCall(Func, {}); } // The mappings for emitLeading/TrailingFence is taken from // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, bool IsStore, bool IsLoad) const { if (Ord == AtomicOrdering::SequentiallyConsistent) return callIntrinsic(Builder, Intrinsic::ppc_sync); if (isReleaseOrStronger(Ord)) return callIntrinsic(Builder, Intrinsic::ppc_lwsync); return nullptr; } Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, bool IsStore, bool IsLoad) const { if (IsLoad && isAcquireOrStronger(Ord)) return callIntrinsic(Builder, Intrinsic::ppc_lwsync); // FIXME: this is too conservative, a dependent branch + isync is enough. // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. return nullptr; } MachineBasicBlock * PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, unsigned AtomicSize, unsigned BinOpcode) const { // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. const TargetInstrInfo *TII = Subtarget.getInstrInfo(); auto LoadMnemonic = PPC::LDARX; auto StoreMnemonic = PPC::STDCX; switch (AtomicSize) { default: llvm_unreachable("Unexpected size of atomic entity"); case 1: LoadMnemonic = PPC::LBARX; StoreMnemonic = PPC::STBCX; assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4"); break; case 2: LoadMnemonic = PPC::LHARX; StoreMnemonic = PPC::STHCX; assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4"); break; case 4: LoadMnemonic = PPC::LWARX; StoreMnemonic = PPC::STWCX; break; case 8: LoadMnemonic = PPC::LDARX; StoreMnemonic = PPC::STDCX; break; } const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI.getOperand(0).getReg(); unsigned ptrA = MI.getOperand(1).getReg(); unsigned ptrB = MI.getOperand(2).getReg(); unsigned incr = MI.getOperand(3).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loopMBB); F->insert(It, exitMBB); exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned TmpReg = (!BinOpcode) ? incr : RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass); // thisMBB: // ... // fallthrough --> loopMBB BB->addSuccessor(loopMBB); // loopMBB: // l[wd]arx dest, ptr // add r0, dest, incr // st[wd]cx. r0, ptr // bne- loopMBB // fallthrough --> exitMBB BB = loopMBB; BuildMI(BB, dl, TII->get(LoadMnemonic), dest) .addReg(ptrA).addReg(ptrB); if (BinOpcode) BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest); BuildMI(BB, dl, TII->get(StoreMnemonic)) .addReg(TmpReg).addReg(ptrA).addReg(ptrB); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); BB->addSuccessor(loopMBB); BB->addSuccessor(exitMBB); // exitMBB: // ... BB = exitMBB; return BB; } MachineBasicBlock * PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, bool is8bit, // operation unsigned BinOpcode) const { // If we support part-word atomic mnemonics, just use them if (Subtarget.hasPartwordAtomics()) return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode); // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // In 64 bit mode we have to use 64 bits for addresses, even though the // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address // registers without caring whether they're 32 or 64, but here we're // doing actual arithmetic on the addresses. bool is64bit = Subtarget.isPPC64(); unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI.getOperand(0).getReg(); unsigned ptrA = MI.getOperand(1).getReg(); unsigned ptrB = MI.getOperand(2).getReg(); unsigned incr = MI.getOperand(3).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loopMBB); F->insert(It, exitMBB); exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = F->getRegInfo(); const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; unsigned PtrReg = RegInfo.createVirtualRegister(RC); unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); unsigned ShiftReg = RegInfo.createVirtualRegister(RC); unsigned Incr2Reg = RegInfo.createVirtualRegister(RC); unsigned MaskReg = RegInfo.createVirtualRegister(RC); unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC); unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); unsigned Ptr1Reg; unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC); // thisMBB: // ... // fallthrough --> loopMBB BB->addSuccessor(loopMBB); // The 4-byte load must be aligned, while a char or short may be // anywhere in the word. Hence all this nasty bookkeeping code. // add ptr1, ptrA, ptrB [copy if ptrA==0] // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] // xori shift, shift1, 24 [16] // rlwinm ptr, ptr1, 0, 0, 29 // slw incr2, incr, shift // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] // slw mask, mask2, shift // loopMBB: // lwarx tmpDest, ptr // add tmp, tmpDest, incr2 // andc tmp2, tmpDest, mask // and tmp3, tmp, mask // or tmp4, tmp3, tmp2 // stwcx. tmp4, ptr // bne- loopMBB // fallthrough --> exitMBB // srw dest, tmpDest, shift if (ptrA != ZeroReg) { Ptr1Reg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) .addReg(ptrA).addReg(ptrB); } else { Ptr1Reg = ptrB; } BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) .addImm(3).addImm(27).addImm(is8bit ? 28 : 27); BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) .addReg(Shift1Reg).addImm(is8bit ? 24 : 16); if (is64bit) BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) .addReg(Ptr1Reg).addImm(0).addImm(61); else BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg) .addReg(incr).addReg(ShiftReg); if (is8bit) BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); else { BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); BuildMI(BB, dl, TII->get(PPC::ORI),Mask2Reg).addReg(Mask3Reg).addImm(65535); } BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) .addReg(Mask2Reg).addReg(ShiftReg); BB = loopMBB; BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) .addReg(ZeroReg).addReg(PtrReg); if (BinOpcode) BuildMI(BB, dl, TII->get(BinOpcode), TmpReg) .addReg(Incr2Reg).addReg(TmpDestReg); BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg) .addReg(TmpDestReg).addReg(MaskReg); BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg) .addReg(TmpReg).addReg(MaskReg); BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg) .addReg(Tmp3Reg).addReg(Tmp2Reg); BuildMI(BB, dl, TII->get(PPC::STWCX)) .addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); BB->addSuccessor(loopMBB); BB->addSuccessor(exitMBB); // exitMBB: // ... BB = exitMBB; BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg) .addReg(ShiftReg); return BB; } llvm::MachineBasicBlock * PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); unsigned DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(RC->hasType(MVT::i32) && "Invalid destination!"); unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); // For v = setjmp(buf), we generate // // thisMBB: // SjLjSetup mainMBB // bl mainMBB // v_restore = 1 // b sinkMBB // // mainMBB: // buf[LabelOffset] = LR // v_main = 0 // // sinkMBB: // v = phi(main, restore) // MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, sinkMBB); MachineInstrBuilder MIB; // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // Note that the structure of the jmp_buf used here is not compatible // with that used by libc, and is not designed to be. Specifically, it // stores only those 'reserved' registers that LLVM does not otherwise // understand how to spill. Also, by convention, by the time this // intrinsic is called, Clang has already stored the frame address in the // first slot of the buffer and stack address in the third. Following the // X86 target code, we'll store the jump address in the second slot. We also // need to save the TOC pointer (R2) to handle jumps between shared // libraries, and that will be stored in the fourth slot. The thread // identifier (R13) is not affected. // thisMBB: const int64_t LabelOffset = 1 * PVT.getStoreSize(); const int64_t TOCOffset = 3 * PVT.getStoreSize(); const int64_t BPOffset = 4 * PVT.getStoreSize(); // Prepare IP either in reg. const TargetRegisterClass *PtrRC = getRegClassFor(PVT); unsigned LabelReg = MRI.createVirtualRegister(PtrRC); unsigned BufReg = MI.getOperand(1).getReg(); if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) { setUsesTOCBasePtr(*MBB->getParent()); MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD)) .addReg(PPC::X2) .addImm(TOCOffset) .addReg(BufReg); MIB.setMemRefs(MMOBegin, MMOEnd); } // Naked functions never have a base pointer, and so we use r1. For all // other functions, this decision must be delayed until during PEI. unsigned BaseReg; if (MF->getFunction()->hasFnAttribute(Attribute::Naked)) BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1; else BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP; MIB = BuildMI(*thisMBB, MI, DL, TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW)) .addReg(BaseReg) .addImm(BPOffset) .addReg(BufReg); MIB.setMemRefs(MMOBegin, MMOEnd); // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); MIB.addRegMask(TRI->getNoPreservedMask()); BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup)) .addMBB(mainMBB); MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB); thisMBB->addSuccessor(mainMBB, BranchProbability::getZero()); thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne()); // mainMBB: // mainDstReg = 0 MIB = BuildMI(mainMBB, DL, TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg); // Store IP if (Subtarget.isPPC64()) { MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD)) .addReg(LabelReg) .addImm(LabelOffset) .addReg(BufReg); } else { MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW)) .addReg(LabelReg) .addImm(LabelOffset) .addReg(BufReg); } MIB.setMemRefs(MMOBegin, MMOEnd); BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0); mainMBB->addSuccessor(sinkMBB); // sinkMBB: BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(PPC::PHI), DstReg) .addReg(mainDstReg).addMBB(mainMBB) .addReg(restoreDstReg).addMBB(thisMBB); MI.eraseFromParent(); return sinkMBB; } MachineBasicBlock * PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); const TargetRegisterClass *RC = (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31; unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1; unsigned BP = (PVT == MVT::i64) ? PPC::X30 : (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29 : PPC::R30); MachineInstrBuilder MIB; const int64_t LabelOffset = 1 * PVT.getStoreSize(); const int64_t SPOffset = 2 * PVT.getStoreSize(); const int64_t TOCOffset = 3 * PVT.getStoreSize(); const int64_t BPOffset = 4 * PVT.getStoreSize(); unsigned BufReg = MI.getOperand(0).getReg(); // Reload FP (the jumped-to function may not have had a // frame pointer, and if so, then its r31 will be restored // as necessary). if (PVT == MVT::i64) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP) .addImm(0) .addReg(BufReg); } else { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP) .addImm(0) .addReg(BufReg); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload IP if (PVT == MVT::i64) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp) .addImm(LabelOffset) .addReg(BufReg); } else { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp) .addImm(LabelOffset) .addReg(BufReg); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload SP if (PVT == MVT::i64) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP) .addImm(SPOffset) .addReg(BufReg); } else { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP) .addImm(SPOffset) .addReg(BufReg); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload BP if (PVT == MVT::i64) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP) .addImm(BPOffset) .addReg(BufReg); } else { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP) .addImm(BPOffset) .addReg(BufReg); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload TOC if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) { setUsesTOCBasePtr(*MBB->getParent()); MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2) .addImm(TOCOffset) .addReg(BufReg); MIB.setMemRefs(MMOBegin, MMOEnd); } // Jump BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp); BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR)); MI.eraseFromParent(); return MBB; } MachineBasicBlock * PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { if (MI.getOpcode() == TargetOpcode::STACKMAP || MI.getOpcode() == TargetOpcode::PATCHPOINT) { if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() && MI.getOpcode() == TargetOpcode::PATCHPOINT) { // Call lowering should have added an r2 operand to indicate a dependence // on the TOC base pointer value. It can't however, because there is no // way to mark the dependence as implicit there, and so the stackmap code // will confuse it with a regular operand. Instead, add the dependence // here. setUsesTOCBasePtr(*BB->getParent()); MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true)); } return emitPatchPoint(MI, BB); } if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 || MI.getOpcode() == PPC::EH_SjLj_SetJmp64) { return emitEHSjLjSetJmp(MI, BB); } else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 || MI.getOpcode() == PPC::EH_SjLj_LongJmp64) { return emitEHSjLjLongJmp(MI, BB); } const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // To "insert" these instructions we actually have to insert their // control-flow patterns. const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); MachineFunction *F = BB->getParent(); if (Subtarget.hasISEL() && (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == PPC::SELECT_CC_I8 || MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8)) { SmallVector Cond; if (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == PPC::SELECT_CC_I8) Cond.push_back(MI.getOperand(4)); else Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET)); Cond.push_back(MI.getOperand(1)); DebugLoc dl = MI.getDebugLoc(); TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond, MI.getOperand(2).getReg(), MI.getOperand(3).getReg()); } else if (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == PPC::SELECT_CC_I8 || MI.getOpcode() == PPC::SELECT_CC_F4 || MI.getOpcode() == PPC::SELECT_CC_F8 || MI.getOpcode() == PPC::SELECT_CC_QFRC || MI.getOpcode() == PPC::SELECT_CC_QSRC || MI.getOpcode() == PPC::SELECT_CC_QBRC || MI.getOpcode() == PPC::SELECT_CC_VRRC || MI.getOpcode() == PPC::SELECT_CC_VSFRC || MI.getOpcode() == PPC::SELECT_CC_VSSRC || MI.getOpcode() == PPC::SELECT_CC_VSRC || MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 || MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 || MI.getOpcode() == PPC::SELECT_QFRC || MI.getOpcode() == PPC::SELECT_QSRC || MI.getOpcode() == PPC::SELECT_QBRC || MI.getOpcode() == PPC::SELECT_VRRC || MI.getOpcode() == PPC::SELECT_VSFRC || MI.getOpcode() == PPC::SELECT_VSSRC || MI.getOpcode() == PPC::SELECT_VSRC) { // The incoming instruction knows the destination vreg to set, the // condition code register to branch on, the true/false values to // select between, and a branch opcode to use. // thisMBB: // ... // TrueVal = ... // cmpTY ccX, r1, r2 // bCC copy1MBB // fallthrough --> copy0MBB MachineBasicBlock *thisMBB = BB; MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); DebugLoc dl = MI.getDebugLoc(); F->insert(It, copy0MBB); F->insert(It, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); // Next, add the true and fallthrough blocks as its successors. BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 || MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 || MI.getOpcode() == PPC::SELECT_QFRC || MI.getOpcode() == PPC::SELECT_QSRC || MI.getOpcode() == PPC::SELECT_QBRC || MI.getOpcode() == PPC::SELECT_VRRC || MI.getOpcode() == PPC::SELECT_VSFRC || MI.getOpcode() == PPC::SELECT_VSSRC || MI.getOpcode() == PPC::SELECT_VSRC) { BuildMI(BB, dl, TII->get(PPC::BC)) .addReg(MI.getOperand(1).getReg()) .addMBB(sinkMBB); } else { unsigned SelectPred = MI.getOperand(4).getImm(); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(SelectPred) .addReg(MI.getOperand(1).getReg()) .addMBB(sinkMBB); } // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB BB = copy0MBB; // Update machine-CFG edges BB->addSuccessor(sinkMBB); // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = sinkMBB; BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg()) .addReg(MI.getOperand(3).getReg()) .addMBB(copy0MBB) .addReg(MI.getOperand(2).getReg()) .addMBB(thisMBB); } else if (MI.getOpcode() == PPC::ReadTB) { // To read the 64-bit time-base register on a 32-bit target, we read the // two halves. Should the counter have wrapped while it was being read, we // need to try again. // ... // readLoop: // mfspr Rx,TBU # load from TBU // mfspr Ry,TB # load from TB // mfspr Rz,TBU # load from TBU // cmpw crX,Rx,Rz # check if 'old'='new' // bne readLoop # branch if they're not equal // ... MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); DebugLoc dl = MI.getDebugLoc(); F->insert(It, readMBB); F->insert(It, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(readMBB); BB = readMBB; MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); unsigned LoReg = MI.getOperand(0).getReg(); unsigned HiReg = MI.getOperand(1).getReg(); BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269); BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268); BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269); unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg) .addReg(HiReg).addReg(ReadAgainReg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(CmpReg).addMBB(readMBB); BB->addSuccessor(readMBB); BB->addSuccessor(sinkMBB); } else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64) BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::AND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64) BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::OR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64) BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64) BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64) BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64) BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8); else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, 0); else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, 0); else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32) BB = EmitAtomicBinary(MI, BB, 4, 0); else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64) BB = EmitAtomicBinary(MI, BB, 8, 0); else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 || MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 || (Subtarget.hasPartwordAtomics() && MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) || (Subtarget.hasPartwordAtomics() && MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) { bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64; auto LoadMnemonic = PPC::LDARX; auto StoreMnemonic = PPC::STDCX; switch (MI.getOpcode()) { default: llvm_unreachable("Compare and swap of unknown size"); case PPC::ATOMIC_CMP_SWAP_I8: LoadMnemonic = PPC::LBARX; StoreMnemonic = PPC::STBCX; assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); break; case PPC::ATOMIC_CMP_SWAP_I16: LoadMnemonic = PPC::LHARX; StoreMnemonic = PPC::STHCX; assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); break; case PPC::ATOMIC_CMP_SWAP_I32: LoadMnemonic = PPC::LWARX; StoreMnemonic = PPC::STWCX; break; case PPC::ATOMIC_CMP_SWAP_I64: LoadMnemonic = PPC::LDARX; StoreMnemonic = PPC::STDCX; break; } unsigned dest = MI.getOperand(0).getReg(); unsigned ptrA = MI.getOperand(1).getReg(); unsigned ptrB = MI.getOperand(2).getReg(); unsigned oldval = MI.getOperand(3).getReg(); unsigned newval = MI.getOperand(4).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loop1MBB); F->insert(It, loop2MBB); F->insert(It, midMBB); F->insert(It, exitMBB); exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); // thisMBB: // ... // fallthrough --> loopMBB BB->addSuccessor(loop1MBB); // loop1MBB: // l[bhwd]arx dest, ptr // cmp[wd] dest, oldval // bne- midMBB // loop2MBB: // st[bhwd]cx. newval, ptr // bne- loopMBB // b exitBB // midMBB: // st[bhwd]cx. dest, ptr // exitBB: BB = loop1MBB; BuildMI(BB, dl, TII->get(LoadMnemonic), dest) .addReg(ptrA).addReg(ptrB); BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0) .addReg(oldval).addReg(dest); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); BB->addSuccessor(loop2MBB); BB->addSuccessor(midMBB); BB = loop2MBB; BuildMI(BB, dl, TII->get(StoreMnemonic)) .addReg(newval).addReg(ptrA).addReg(ptrB); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); BB->addSuccessor(loop1MBB); BB->addSuccessor(exitMBB); BB = midMBB; BuildMI(BB, dl, TII->get(StoreMnemonic)) .addReg(dest).addReg(ptrA).addReg(ptrB); BB->addSuccessor(exitMBB); // exitMBB: // ... BB = exitMBB; } else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 || MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) { // We must use 64-bit registers for addresses when targeting 64-bit, // since we're actually doing arithmetic on them. Other registers // can be 32-bit. bool is64bit = Subtarget.isPPC64(); bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8; unsigned dest = MI.getOperand(0).getReg(); unsigned ptrA = MI.getOperand(1).getReg(); unsigned ptrB = MI.getOperand(2).getReg(); unsigned oldval = MI.getOperand(3).getReg(); unsigned newval = MI.getOperand(4).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loop1MBB); F->insert(It, loop2MBB); F->insert(It, midMBB); F->insert(It, exitMBB); exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = F->getRegInfo(); const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; unsigned PtrReg = RegInfo.createVirtualRegister(RC); unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); unsigned ShiftReg = RegInfo.createVirtualRegister(RC); unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC); unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC); unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC); unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC); unsigned MaskReg = RegInfo.createVirtualRegister(RC); unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); unsigned Ptr1Reg; unsigned TmpReg = RegInfo.createVirtualRegister(RC); unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; // thisMBB: // ... // fallthrough --> loopMBB BB->addSuccessor(loop1MBB); // The 4-byte load must be aligned, while a char or short may be // anywhere in the word. Hence all this nasty bookkeeping code. // add ptr1, ptrA, ptrB [copy if ptrA==0] // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] // xori shift, shift1, 24 [16] // rlwinm ptr, ptr1, 0, 0, 29 // slw newval2, newval, shift // slw oldval2, oldval,shift // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] // slw mask, mask2, shift // and newval3, newval2, mask // and oldval3, oldval2, mask // loop1MBB: // lwarx tmpDest, ptr // and tmp, tmpDest, mask // cmpw tmp, oldval3 // bne- midMBB // loop2MBB: // andc tmp2, tmpDest, mask // or tmp4, tmp2, newval3 // stwcx. tmp4, ptr // bne- loop1MBB // b exitBB // midMBB: // stwcx. tmpDest, ptr // exitBB: // srw dest, tmpDest, shift if (ptrA != ZeroReg) { Ptr1Reg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) .addReg(ptrA).addReg(ptrB); } else { Ptr1Reg = ptrB; } BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) .addImm(3).addImm(27).addImm(is8bit ? 28 : 27); BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) .addReg(Shift1Reg).addImm(is8bit ? 24 : 16); if (is64bit) BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) .addReg(Ptr1Reg).addImm(0).addImm(61); else BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg) .addReg(newval).addReg(ShiftReg); BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg) .addReg(oldval).addReg(ShiftReg); if (is8bit) BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); else { BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg) .addReg(Mask3Reg).addImm(65535); } BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) .addReg(Mask2Reg).addReg(ShiftReg); BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg) .addReg(NewVal2Reg).addReg(MaskReg); BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg) .addReg(OldVal2Reg).addReg(MaskReg); BB = loop1MBB; BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) .addReg(ZeroReg).addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::AND),TmpReg) .addReg(TmpDestReg).addReg(MaskReg); BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0) .addReg(TmpReg).addReg(OldVal3Reg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); BB->addSuccessor(loop2MBB); BB->addSuccessor(midMBB); BB = loop2MBB; BuildMI(BB, dl, TII->get(PPC::ANDC),Tmp2Reg) .addReg(TmpDestReg).addReg(MaskReg); BuildMI(BB, dl, TII->get(PPC::OR),Tmp4Reg) .addReg(Tmp2Reg).addReg(NewVal3Reg); BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg) .addReg(ZeroReg).addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); BB->addSuccessor(loop1MBB); BB->addSuccessor(exitMBB); BB = midMBB; BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg) .addReg(ZeroReg).addReg(PtrReg); BB->addSuccessor(exitMBB); // exitMBB: // ... BB = exitMBB; BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg) .addReg(ShiftReg); } else if (MI.getOpcode() == PPC::FADDrtz) { // This pseudo performs an FADD with rounding mode temporarily forced // to round-to-zero. We emit this via custom inserter since the FPSCR // is not modeled at the SelectionDAG level. unsigned Dest = MI.getOperand(0).getReg(); unsigned Src1 = MI.getOperand(1).getReg(); unsigned Src2 = MI.getOperand(2).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); // Save FPSCR value. BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg); // Set rounding mode to round-to-zero. BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31); BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30); // Perform addition. BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2); // Restore FPSCR value. BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg); } else if (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT || MI.getOpcode() == PPC::ANDIo_1_GT_BIT || MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 || MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) { unsigned Opcode = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 || MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) ? PPC::ANDIo8 : PPC::ANDIo; bool isEQ = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT || MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8); MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned Dest = RegInfo.createVirtualRegister(Opcode == PPC::ANDIo ? &PPC::GPRCRegClass : &PPC::G8RCRegClass); DebugLoc dl = MI.getDebugLoc(); BuildMI(*BB, MI, dl, TII->get(Opcode), Dest) .addReg(MI.getOperand(1).getReg()) .addImm(1); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT); } else if (MI.getOpcode() == PPC::TCHECK_RET) { DebugLoc Dl = MI.getDebugLoc(); MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg); return BB; } else { llvm_unreachable("Unexpected instr type to insert"); } MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } //===----------------------------------------------------------------------===// // Target Optimization Hooks //===----------------------------------------------------------------------===// static std::string getRecipOp(const char *Base, EVT VT) { std::string RecipOp(Base); if (VT.getScalarType() == MVT::f64) RecipOp += "d"; else RecipOp += "f"; if (VT.isVector()) RecipOp = "vec-" + RecipOp; return RecipOp; } SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, unsigned &RefinementSteps, bool &UseOneConstNR) const { EVT VT = Operand.getValueType(); if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || (VT == MVT::v4f32 && Subtarget.hasAltivec()) || (VT == MVT::v2f64 && Subtarget.hasVSX()) || (VT == MVT::v4f32 && Subtarget.hasQPX()) || (VT == MVT::v4f64 && Subtarget.hasQPX())) { TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; std::string RecipOp = getRecipOp("sqrt", VT); if (!Recips.isEnabled(RecipOp)) return SDValue(); RefinementSteps = Recips.getRefinementSteps(RecipOp); UseOneConstNR = true; return DCI.DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand); } return SDValue(); } SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, unsigned &RefinementSteps) const { EVT VT = Operand.getValueType(); if ((VT == MVT::f32 && Subtarget.hasFRES()) || (VT == MVT::f64 && Subtarget.hasFRE()) || (VT == MVT::v4f32 && Subtarget.hasAltivec()) || (VT == MVT::v2f64 && Subtarget.hasVSX()) || (VT == MVT::v4f32 && Subtarget.hasQPX()) || (VT == MVT::v4f64 && Subtarget.hasQPX())) { TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; std::string RecipOp = getRecipOp("div", VT); if (!Recips.isEnabled(RecipOp)) return SDValue(); RefinementSteps = Recips.getRefinementSteps(RecipOp); return DCI.DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand); } return SDValue(); } unsigned PPCTargetLowering::combineRepeatedFPDivisors() const { // Note: This functionality is used only when unsafe-fp-math is enabled, and // on cores with reciprocal estimates (which are used when unsafe-fp-math is // enabled for division), this functionality is redundant with the default // combiner logic (once the division -> reciprocal/multiply transformation // has taken place). As a result, this matters more for older cores than for // newer ones. // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal if there are two or more FDIVs (for embedded cores with only // one FP pipeline) for three or more FDIVs (for generic OOO cores). switch (Subtarget.getDarwinDirective()) { default: return 3; case PPC::DIR_440: case PPC::DIR_A2: case PPC::DIR_E500mc: case PPC::DIR_E5500: return 2; } } // isConsecutiveLSLoc needs to work even if all adds have not yet been // collapsed, and so we need to look through chains of them. static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base, int64_t& Offset, SelectionDAG &DAG) { if (DAG.isBaseWithConstantOffset(Loc)) { Base = Loc.getOperand(0); Offset += cast(Loc.getOperand(1))->getSExtValue(); // The base might itself be a base plus an offset, and if so, accumulate // that as well. getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG); } } static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base, unsigned Bytes, int Dist, SelectionDAG &DAG) { if (VT.getSizeInBits() / 8 != Bytes) return false; SDValue BaseLoc = Base->getBasePtr(); if (Loc.getOpcode() == ISD::FrameIndex) { if (BaseLoc.getOpcode() != ISD::FrameIndex) return false; const MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); int FI = cast(Loc)->getIndex(); int BFI = cast(BaseLoc)->getIndex(); int FS = MFI->getObjectSize(FI); int BFS = MFI->getObjectSize(BFI); if (FS != BFS || FS != (int)Bytes) return false; return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes); } SDValue Base1 = Loc, Base2 = BaseLoc; int64_t Offset1 = 0, Offset2 = 0; getBaseWithConstantOffset(Loc, Base1, Offset1, DAG); getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG); if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes)) return true; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const GlobalValue *GV1 = nullptr; const GlobalValue *GV2 = nullptr; Offset1 = 0; Offset2 = 0; bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1); bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2); if (isGA1 && isGA2 && GV1 == GV2) return Offset1 == (Offset2 + Dist*Bytes); return false; } // Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does // not enforce equality of the chain operands. static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, unsigned Bytes, int Dist, SelectionDAG &DAG) { if (LSBaseSDNode *LS = dyn_cast(N)) { EVT VT = LS->getMemoryVT(); SDValue Loc = LS->getBasePtr(); return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG); } if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { EVT VT; switch (cast(N->getOperand(1))->getZExtValue()) { default: return false; case Intrinsic::ppc_qpx_qvlfd: case Intrinsic::ppc_qpx_qvlfda: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvlfs: case Intrinsic::ppc_qpx_qvlfsa: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvlfcd: case Intrinsic::ppc_qpx_qvlfcda: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvlfcs: case Intrinsic::ppc_qpx_qvlfcsa: VT = MVT::v2f32; break; case Intrinsic::ppc_qpx_qvlfiwa: case Intrinsic::ppc_qpx_qvlfiwz: case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: case Intrinsic::ppc_vsx_lxvw4x: VT = MVT::v4i32; break; case Intrinsic::ppc_vsx_lxvd2x: VT = MVT::v2f64; break; case Intrinsic::ppc_altivec_lvebx: VT = MVT::i8; break; case Intrinsic::ppc_altivec_lvehx: VT = MVT::i16; break; case Intrinsic::ppc_altivec_lvewx: VT = MVT::i32; break; } return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG); } if (N->getOpcode() == ISD::INTRINSIC_VOID) { EVT VT; switch (cast(N->getOperand(1))->getZExtValue()) { default: return false; case Intrinsic::ppc_qpx_qvstfd: case Intrinsic::ppc_qpx_qvstfda: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvstfs: case Intrinsic::ppc_qpx_qvstfsa: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvstfcd: case Intrinsic::ppc_qpx_qvstfcda: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvstfcs: case Intrinsic::ppc_qpx_qvstfcsa: VT = MVT::v2f32; break; case Intrinsic::ppc_qpx_qvstfiw: case Intrinsic::ppc_qpx_qvstfiwa: case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: case Intrinsic::ppc_vsx_stxvw4x: VT = MVT::v4i32; break; case Intrinsic::ppc_vsx_stxvd2x: VT = MVT::v2f64; break; case Intrinsic::ppc_altivec_stvebx: VT = MVT::i8; break; case Intrinsic::ppc_altivec_stvehx: VT = MVT::i16; break; case Intrinsic::ppc_altivec_stvewx: VT = MVT::i32; break; } return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG); } return false; } // Return true is there is a nearyby consecutive load to the one provided // (regardless of alignment). We search up and down the chain, looking though // token factors and other loads (but nothing else). As a result, a true result // indicates that it is safe to create a new consecutive load adjacent to the // load provided. static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { SDValue Chain = LD->getChain(); EVT VT = LD->getMemoryVT(); SmallSet LoadRoots; SmallVector Queue(1, Chain.getNode()); SmallSet Visited; // First, search up the chain, branching to follow all token-factor operands. // If we find a consecutive load, then we're done, otherwise, record all // nodes just above the top-level loads and token factors. while (!Queue.empty()) { SDNode *ChainNext = Queue.pop_back_val(); if (!Visited.insert(ChainNext).second) continue; if (MemSDNode *ChainLD = dyn_cast(ChainNext)) { if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) return true; if (!Visited.count(ChainLD->getChain().getNode())) Queue.push_back(ChainLD->getChain().getNode()); } else if (ChainNext->getOpcode() == ISD::TokenFactor) { for (const SDUse &O : ChainNext->ops()) if (!Visited.count(O.getNode())) Queue.push_back(O.getNode()); } else LoadRoots.insert(ChainNext); } // Second, search down the chain, starting from the top-level nodes recorded // in the first phase. These top-level nodes are the nodes just above all // loads and token factors. Starting with their uses, recursively look though // all loads (just the chain uses) and token factors to find a consecutive // load. Visited.clear(); Queue.clear(); for (SmallSet::iterator I = LoadRoots.begin(), IE = LoadRoots.end(); I != IE; ++I) { Queue.push_back(*I); while (!Queue.empty()) { SDNode *LoadRoot = Queue.pop_back_val(); if (!Visited.insert(LoadRoot).second) continue; if (MemSDNode *ChainLD = dyn_cast(LoadRoot)) if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) return true; for (SDNode::use_iterator UI = LoadRoot->use_begin(), UE = LoadRoot->use_end(); UI != UE; ++UI) if (((isa(*UI) && cast(*UI)->getChain().getNode() == LoadRoot) || UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI)) Queue.push_back(*UI); } } return false; } SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits"); // If we're tracking CR bits, we need to be careful that we don't have: // trunc(binary-ops(zext(x), zext(y))) // or // trunc(binary-ops(binary-ops(zext(x), zext(y)), ...) // such that we're unnecessarily moving things into GPRs when it would be // better to keep them in CR bits. // Note that trunc here can be an actual i1 trunc, or can be the effective // truncation that comes from a setcc or select_cc. if (N->getOpcode() == ISD::TRUNCATE && N->getValueType(0) != MVT::i1) return SDValue(); if (N->getOperand(0).getValueType() != MVT::i32 && N->getOperand(0).getValueType() != MVT::i64) return SDValue(); if (N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) { // If we're looking at a comparison, then we need to make sure that the // high bits (all except for the first) don't matter the result. ISD::CondCode CC = cast(N->getOperand( N->getOpcode() == ISD::SETCC ? 2 : 4))->get(); unsigned OpBits = N->getOperand(0).getValueSizeInBits(); if (ISD::isSignedIntSetCC(CC)) { if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits || DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits) return SDValue(); } else if (ISD::isUnsignedIntSetCC(CC)) { if (!DAG.MaskedValueIsZero(N->getOperand(0), APInt::getHighBitsSet(OpBits, OpBits-1)) || !DAG.MaskedValueIsZero(N->getOperand(1), APInt::getHighBitsSet(OpBits, OpBits-1))) return SDValue(); } else { // This is neither a signed nor an unsigned comparison, just make sure // that the high bits are equal. APInt Op1Zero, Op1One; APInt Op2Zero, Op2One; DAG.computeKnownBits(N->getOperand(0), Op1Zero, Op1One); DAG.computeKnownBits(N->getOperand(1), Op2Zero, Op2One); // We don't really care about what is known about the first bit (if // anything), so clear it in all masks prior to comparing them. Op1Zero.clearBit(0); Op1One.clearBit(0); Op2Zero.clearBit(0); Op2One.clearBit(0); if (Op1Zero != Op2Zero || Op1One != Op2One) return SDValue(); } } // We now know that the higher-order bits are irrelevant, we just need to // make sure that all of the intermediate operations are bit operations, and // all inputs are extensions. if (N->getOperand(0).getOpcode() != ISD::AND && N->getOperand(0).getOpcode() != ISD::OR && N->getOperand(0).getOpcode() != ISD::XOR && N->getOperand(0).getOpcode() != ISD::SELECT && N->getOperand(0).getOpcode() != ISD::SELECT_CC && N->getOperand(0).getOpcode() != ISD::TRUNCATE && N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND && N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND && N->getOperand(0).getOpcode() != ISD::ANY_EXTEND) return SDValue(); if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) && N->getOperand(1).getOpcode() != ISD::AND && N->getOperand(1).getOpcode() != ISD::OR && N->getOperand(1).getOpcode() != ISD::XOR && N->getOperand(1).getOpcode() != ISD::SELECT && N->getOperand(1).getOpcode() != ISD::SELECT_CC && N->getOperand(1).getOpcode() != ISD::TRUNCATE && N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND && N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND && N->getOperand(1).getOpcode() != ISD::ANY_EXTEND) return SDValue(); SmallVector Inputs; SmallVector BinOps, PromOps; SmallPtrSet Visited; for (unsigned i = 0; i < 2; ++i) { if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND || N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND || N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) && N->getOperand(i).getOperand(0).getValueType() == MVT::i1) || isa(N->getOperand(i))) Inputs.push_back(N->getOperand(i)); else BinOps.push_back(N->getOperand(i)); if (N->getOpcode() == ISD::TRUNCATE) break; } // Visit all inputs, collect all binary operations (and, or, xor and // select) that are all fed by extensions. while (!BinOps.empty()) { SDValue BinOp = BinOps.back(); BinOps.pop_back(); if (!Visited.insert(BinOp.getNode()).second) continue; PromOps.push_back(BinOp); for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { // The condition of the select is not promoted. if (BinOp.getOpcode() == ISD::SELECT && i == 0) continue; if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) continue; if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) && BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) || isa(BinOp.getOperand(i))) { Inputs.push_back(BinOp.getOperand(i)); } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || BinOp.getOperand(i).getOpcode() == ISD::OR || BinOp.getOperand(i).getOpcode() == ISD::XOR || BinOp.getOperand(i).getOpcode() == ISD::SELECT || BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC || BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) { BinOps.push_back(BinOp.getOperand(i)); } else { // We have an input that is not an extension or another binary // operation; we'll abort this transformation. return SDValue(); } } } // Make sure that this is a self-contained cluster of operations (which // is not quite the same thing as saying that everything has only one // use). for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { if (isa(Inputs[i])) continue; for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), UE = Inputs[i].getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User != N && !Visited.count(User)) return SDValue(); // Make sure that we're not going to promote the non-output-value // operand(s) or SELECT or SELECT_CC. // FIXME: Although we could sometimes handle this, and it does occur in // practice that one of the condition inputs to the select is also one of // the outputs, we currently can't deal with this. if (User->getOpcode() == ISD::SELECT) { if (User->getOperand(0) == Inputs[i]) return SDValue(); } else if (User->getOpcode() == ISD::SELECT_CC) { if (User->getOperand(0) == Inputs[i] || User->getOperand(1) == Inputs[i]) return SDValue(); } } } for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), UE = PromOps[i].getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User != N && !Visited.count(User)) return SDValue(); // Make sure that we're not going to promote the non-output-value // operand(s) or SELECT or SELECT_CC. // FIXME: Although we could sometimes handle this, and it does occur in // practice that one of the condition inputs to the select is also one of // the outputs, we currently can't deal with this. if (User->getOpcode() == ISD::SELECT) { if (User->getOperand(0) == PromOps[i]) return SDValue(); } else if (User->getOpcode() == ISD::SELECT_CC) { if (User->getOperand(0) == PromOps[i] || User->getOperand(1) == PromOps[i]) return SDValue(); } } } // Replace all inputs with the extension operand. for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { // Constants may have users outside the cluster of to-be-promoted nodes, // and so we need to replace those as we do the promotions. if (isa(Inputs[i])) continue; else DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0)); } std::list PromOpHandles; for (auto &PromOp : PromOps) PromOpHandles.emplace_back(PromOp); // Replace all operations (these are all the same, but have a different // (i1) return type). DAG.getNode will validate that the types of // a binary operator match, so go through the list in reverse so that // we've likely promoted both operands first. Any intermediate truncations or // extensions disappear. while (!PromOpHandles.empty()) { SDValue PromOp = PromOpHandles.back().getValue(); PromOpHandles.pop_back(); if (PromOp.getOpcode() == ISD::TRUNCATE || PromOp.getOpcode() == ISD::SIGN_EXTEND || PromOp.getOpcode() == ISD::ZERO_EXTEND || PromOp.getOpcode() == ISD::ANY_EXTEND) { if (!isa(PromOp.getOperand(0)) && PromOp.getOperand(0).getValueType() != MVT::i1) { // The operand is not yet ready (see comment below). PromOpHandles.emplace_front(PromOp); continue; } SDValue RepValue = PromOp.getOperand(0); if (isa(RepValue)) RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue); DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue); continue; } unsigned C; switch (PromOp.getOpcode()) { default: C = 0; break; case ISD::SELECT: C = 1; break; case ISD::SELECT_CC: C = 2; break; } if ((!isa(PromOp.getOperand(C)) && PromOp.getOperand(C).getValueType() != MVT::i1) || (!isa(PromOp.getOperand(C+1)) && PromOp.getOperand(C+1).getValueType() != MVT::i1)) { // The to-be-promoted operands of this node have not yet been // promoted (this should be rare because we're going through the // list backward, but if one of the operands has several users in // this cluster of to-be-promoted nodes, it is possible). PromOpHandles.emplace_front(PromOp); continue; } SmallVector Ops(PromOp.getNode()->op_begin(), PromOp.getNode()->op_end()); // If there are any constant inputs, make sure they're replaced now. for (unsigned i = 0; i < 2; ++i) if (isa(Ops[C+i])) Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]); DAG.ReplaceAllUsesOfValueWith(PromOp, DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops)); } // Now we're left with the initial truncation itself. if (N->getOpcode() == ISD::TRUNCATE) return N->getOperand(0); // Otherwise, this is a comparison. The operands to be compared have just // changed type (to i1), but everything else is the same. return SDValue(N, 0); } SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); // If we're tracking CR bits, we need to be careful that we don't have: // zext(binary-ops(trunc(x), trunc(y))) // or // zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...) // such that we're unnecessarily moving things into CR bits that can more // efficiently stay in GPRs. Note that if we're not certain that the high // bits are set as required by the final extension, we still may need to do // some masking to get the proper behavior. // This same functionality is important on PPC64 when dealing with // 32-to-64-bit extensions; these occur often when 32-bit values are used as // the return values of functions. Because it is so similar, it is handled // here as well. if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64) return SDValue(); if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) || (N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64()))) return SDValue(); if (N->getOperand(0).getOpcode() != ISD::AND && N->getOperand(0).getOpcode() != ISD::OR && N->getOperand(0).getOpcode() != ISD::XOR && N->getOperand(0).getOpcode() != ISD::SELECT && N->getOperand(0).getOpcode() != ISD::SELECT_CC) return SDValue(); SmallVector Inputs; SmallVector BinOps(1, N->getOperand(0)), PromOps; SmallPtrSet Visited; // Visit all inputs, collect all binary operations (and, or, xor and // select) that are all fed by truncations. while (!BinOps.empty()) { SDValue BinOp = BinOps.back(); BinOps.pop_back(); if (!Visited.insert(BinOp.getNode()).second) continue; PromOps.push_back(BinOp); for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { // The condition of the select is not promoted. if (BinOp.getOpcode() == ISD::SELECT && i == 0) continue; if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) continue; if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || isa(BinOp.getOperand(i))) { Inputs.push_back(BinOp.getOperand(i)); } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || BinOp.getOperand(i).getOpcode() == ISD::OR || BinOp.getOperand(i).getOpcode() == ISD::XOR || BinOp.getOperand(i).getOpcode() == ISD::SELECT || BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) { BinOps.push_back(BinOp.getOperand(i)); } else { // We have an input that is not a truncation or another binary // operation; we'll abort this transformation. return SDValue(); } } } // The operands of a select that must be truncated when the select is // promoted because the operand is actually part of the to-be-promoted set. DenseMap SelectTruncOp[2]; // Make sure that this is a self-contained cluster of operations (which // is not quite the same thing as saying that everything has only one // use). for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { if (isa(Inputs[i])) continue; for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), UE = Inputs[i].getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User != N && !Visited.count(User)) return SDValue(); // If we're going to promote the non-output-value operand(s) or SELECT or // SELECT_CC, record them for truncation. if (User->getOpcode() == ISD::SELECT) { if (User->getOperand(0) == Inputs[i]) SelectTruncOp[0].insert(std::make_pair(User, User->getOperand(0).getValueType())); } else if (User->getOpcode() == ISD::SELECT_CC) { if (User->getOperand(0) == Inputs[i]) SelectTruncOp[0].insert(std::make_pair(User, User->getOperand(0).getValueType())); if (User->getOperand(1) == Inputs[i]) SelectTruncOp[1].insert(std::make_pair(User, User->getOperand(1).getValueType())); } } } for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), UE = PromOps[i].getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User != N && !Visited.count(User)) return SDValue(); // If we're going to promote the non-output-value operand(s) or SELECT or // SELECT_CC, record them for truncation. if (User->getOpcode() == ISD::SELECT) { if (User->getOperand(0) == PromOps[i]) SelectTruncOp[0].insert(std::make_pair(User, User->getOperand(0).getValueType())); } else if (User->getOpcode() == ISD::SELECT_CC) { if (User->getOperand(0) == PromOps[i]) SelectTruncOp[0].insert(std::make_pair(User, User->getOperand(0).getValueType())); if (User->getOperand(1) == PromOps[i]) SelectTruncOp[1].insert(std::make_pair(User, User->getOperand(1).getValueType())); } } } unsigned PromBits = N->getOperand(0).getValueSizeInBits(); bool ReallyNeedsExt = false; if (N->getOpcode() != ISD::ANY_EXTEND) { // If all of the inputs are not already sign/zero extended, then // we'll still need to do that at the end. for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { if (isa(Inputs[i])) continue; unsigned OpBits = Inputs[i].getOperand(0).getValueSizeInBits(); assert(PromBits < OpBits && "Truncation not to a smaller bit count?"); if ((N->getOpcode() == ISD::ZERO_EXTEND && !DAG.MaskedValueIsZero(Inputs[i].getOperand(0), APInt::getHighBitsSet(OpBits, OpBits-PromBits))) || (N->getOpcode() == ISD::SIGN_EXTEND && DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) < (OpBits-(PromBits-1)))) { ReallyNeedsExt = true; break; } } } // Replace all inputs, either with the truncation operand, or a // truncation or extension to the final output type. for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { // Constant inputs need to be replaced with the to-be-promoted nodes that // use them because they might have users outside of the cluster of // promoted nodes. if (isa(Inputs[i])) continue; SDValue InSrc = Inputs[i].getOperand(0); if (Inputs[i].getValueType() == N->getValueType(0)) DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc); else if (N->getOpcode() == ISD::SIGN_EXTEND) DAG.ReplaceAllUsesOfValueWith(Inputs[i], DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0))); else if (N->getOpcode() == ISD::ZERO_EXTEND) DAG.ReplaceAllUsesOfValueWith(Inputs[i], DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0))); else DAG.ReplaceAllUsesOfValueWith(Inputs[i], DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0))); } std::list PromOpHandles; for (auto &PromOp : PromOps) PromOpHandles.emplace_back(PromOp); // Replace all operations (these are all the same, but have a different // (promoted) return type). DAG.getNode will validate that the types of // a binary operator match, so go through the list in reverse so that // we've likely promoted both operands first. while (!PromOpHandles.empty()) { SDValue PromOp = PromOpHandles.back().getValue(); PromOpHandles.pop_back(); unsigned C; switch (PromOp.getOpcode()) { default: C = 0; break; case ISD::SELECT: C = 1; break; case ISD::SELECT_CC: C = 2; break; } if ((!isa(PromOp.getOperand(C)) && PromOp.getOperand(C).getValueType() != N->getValueType(0)) || (!isa(PromOp.getOperand(C+1)) && PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) { // The to-be-promoted operands of this node have not yet been // promoted (this should be rare because we're going through the // list backward, but if one of the operands has several users in // this cluster of to-be-promoted nodes, it is possible). PromOpHandles.emplace_front(PromOp); continue; } // For SELECT and SELECT_CC nodes, we do a similar check for any // to-be-promoted comparison inputs. if (PromOp.getOpcode() == ISD::SELECT || PromOp.getOpcode() == ISD::SELECT_CC) { if ((SelectTruncOp[0].count(PromOp.getNode()) && PromOp.getOperand(0).getValueType() != N->getValueType(0)) || (SelectTruncOp[1].count(PromOp.getNode()) && PromOp.getOperand(1).getValueType() != N->getValueType(0))) { PromOpHandles.emplace_front(PromOp); continue; } } SmallVector Ops(PromOp.getNode()->op_begin(), PromOp.getNode()->op_end()); // If this node has constant inputs, then they'll need to be promoted here. for (unsigned i = 0; i < 2; ++i) { if (!isa(Ops[C+i])) continue; if (Ops[C+i].getValueType() == N->getValueType(0)) continue; if (N->getOpcode() == ISD::SIGN_EXTEND) Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); else if (N->getOpcode() == ISD::ZERO_EXTEND) Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); else Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); } // If we've promoted the comparison inputs of a SELECT or SELECT_CC, // truncate them again to the original value type. if (PromOp.getOpcode() == ISD::SELECT || PromOp.getOpcode() == ISD::SELECT_CC) { auto SI0 = SelectTruncOp[0].find(PromOp.getNode()); if (SI0 != SelectTruncOp[0].end()) Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]); auto SI1 = SelectTruncOp[1].find(PromOp.getNode()); if (SI1 != SelectTruncOp[1].end()) Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]); } DAG.ReplaceAllUsesOfValueWith(PromOp, DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops)); } // Now we're left with the initial extension itself. if (!ReallyNeedsExt) return N->getOperand(0); // To zero extend, just mask off everything except for the first bit (in the // i1 case). if (N->getOpcode() == ISD::ZERO_EXTEND) return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0), DAG.getConstant(APInt::getLowBitsSet( N->getValueSizeInBits(0), PromBits), dl, N->getValueType(0))); assert(N->getOpcode() == ISD::SIGN_EXTEND && "Invalid extension type"); EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout()); SDValue ShiftCst = DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy); return DAG.getNode( ISD::SRA, dl, N->getValueType(0), DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst), ShiftCst); } SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const { assert(N->getOpcode() == ISD::BUILD_VECTOR && "Should be called with a BUILD_VECTOR node"); SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); if (N->getValueType(0) != MVT::v2f64 || !Subtarget.hasVSX()) return SDValue(); // Looking for: // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1)) if (N->getOperand(0).getOpcode() != ISD::SINT_TO_FP && N->getOperand(0).getOpcode() != ISD::UINT_TO_FP) return SDValue(); if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP && N->getOperand(1).getOpcode() != ISD::UINT_TO_FP) return SDValue(); if (N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode()) return SDValue(); SDValue Ext1 = N->getOperand(0).getOperand(0); SDValue Ext2 = N->getOperand(1).getOperand(0); if(Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); ConstantSDNode *Ext1Op = dyn_cast(Ext1.getOperand(1)); ConstantSDNode *Ext2Op = dyn_cast(Ext2.getOperand(1)); if (!Ext1Op || !Ext2Op) return SDValue(); if (Ext1.getValueType() != MVT::i32 || Ext2.getValueType() != MVT::i32) if (Ext1.getOperand(0) != Ext2.getOperand(0)) return SDValue(); int FirstElem = Ext1Op->getZExtValue(); int SecondElem = Ext2Op->getZExtValue(); int SubvecIdx; if (FirstElem == 0 && SecondElem == 1) SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0; else if (FirstElem == 2 && SecondElem == 3) SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1; else return SDValue(); SDValue SrcVec = Ext1.getOperand(0); auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ? PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP; return DAG.getNode(NodeType, dl, MVT::v2f64, SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl)); } SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const { assert((N->getOpcode() == ISD::SINT_TO_FP || N->getOpcode() == ISD::UINT_TO_FP) && "Need an int -> FP conversion node here"); if (!Subtarget.has64BitSupport()) return SDValue(); SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); SDValue Op(N, 0); // Don't handle ppc_fp128 here or i1 conversions. if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) return SDValue(); if (Op.getOperand(0).getValueType() == MVT::i1) return SDValue(); // For i32 intermediate values, unfortunately, the conversion functions // leave the upper 32 bits of the value are undefined. Within the set of // scalar instructions, we have no method for zero- or sign-extending the // value. Thus, we cannot handle i32 intermediate values here. if (Op.getOperand(0).getValueType() == MVT::i32) return SDValue(); assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && "UINT_TO_FP is supported only with FPCVT"); // If we have FCFIDS, then use it when converting to single-precision. // Otherwise, convert to double-precision and then round. unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS : PPCISD::FCFIDS) : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU : PPCISD::FCFID); MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? MVT::f32 : MVT::f64; // If we're converting from a float, to an int, and back to a float again, // then we don't need the store/load pair at all. if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT && Subtarget.hasFPCVT()) || (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) { SDValue Src = Op.getOperand(0).getOperand(0); if (Src.getValueType() == MVT::f32) { Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); DCI.AddToWorklist(Src.getNode()); } else if (Src.getValueType() != MVT::f64) { // Make sure that we don't pick up a ppc_fp128 source value. return SDValue(); } unsigned FCTOp = Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ; SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src); SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp); if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) { FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); DCI.AddToWorklist(FP.getNode()); } return FP; } return SDValue(); } // expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for // builtins) into loads with swaps. SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); SDValue Chain; SDValue Base; MachineMemOperand *MMO; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode for little endian VSX load"); case ISD::LOAD: { LoadSDNode *LD = cast(N); Chain = LD->getChain(); Base = LD->getBasePtr(); MMO = LD->getMemOperand(); // If the MMO suggests this isn't a load of a full vector, leave // things alone. For a built-in, we have to make the change for // correctness, so if there is a size problem that will be a bug. if (MMO->getSize() < 16) return SDValue(); break; } case ISD::INTRINSIC_W_CHAIN: { MemIntrinsicSDNode *Intrin = cast(N); Chain = Intrin->getChain(); // Similarly to the store case below, Intrin->getBasePtr() doesn't get // us what we want. Get operand 2 instead. Base = Intrin->getOperand(2); MMO = Intrin->getMemOperand(); break; } } MVT VecTy = N->getValueType(0).getSimpleVT(); SDValue LoadOps[] = { Chain, Base }; SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl, DAG.getVTList(MVT::v2f64, MVT::Other), LoadOps, MVT::v2f64, MMO); DCI.AddToWorklist(Load.getNode()); Chain = Load.getValue(1); SDValue Swap = DAG.getNode( PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Load); DCI.AddToWorklist(Swap.getNode()); // Add a bitcast if the resulting load type doesn't match v2f64. if (VecTy != MVT::v2f64) { SDValue N = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap); DCI.AddToWorklist(N.getNode()); // Package {bitcast value, swap's chain} to match Load's shape. return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other), N, Swap.getValue(1)); } return Swap; } // expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for // builtins) into stores with swaps. SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); SDValue Chain; SDValue Base; unsigned SrcOpnd; MachineMemOperand *MMO; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode for little endian VSX store"); case ISD::STORE: { StoreSDNode *ST = cast(N); Chain = ST->getChain(); Base = ST->getBasePtr(); MMO = ST->getMemOperand(); SrcOpnd = 1; // If the MMO suggests this isn't a store of a full vector, leave // things alone. For a built-in, we have to make the change for // correctness, so if there is a size problem that will be a bug. if (MMO->getSize() < 16) return SDValue(); break; } case ISD::INTRINSIC_VOID: { MemIntrinsicSDNode *Intrin = cast(N); Chain = Intrin->getChain(); // Intrin->getBasePtr() oddly does not get what we want. Base = Intrin->getOperand(3); MMO = Intrin->getMemOperand(); SrcOpnd = 2; break; } } SDValue Src = N->getOperand(SrcOpnd); MVT VecTy = Src.getValueType().getSimpleVT(); // All stores are done as v2f64 and possible bit cast. if (VecTy != MVT::v2f64) { Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src); DCI.AddToWorklist(Src.getNode()); } SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src); DCI.AddToWorklist(Swap.getNode()); Chain = Swap.getValue(1); SDValue StoreOps[] = { Chain, Swap, Base }; SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl, DAG.getVTList(MVT::Other), StoreOps, VecTy, MMO); DCI.AddToWorklist(Store.getNode()); return Store; } SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); switch (N->getOpcode()) { default: break; case PPCISD::SHL: if (isNullConstant(N->getOperand(0))) // 0 << V -> 0. return N->getOperand(0); break; case PPCISD::SRL: if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0. return N->getOperand(0); break; case PPCISD::SRA: if (ConstantSDNode *C = dyn_cast(N->getOperand(0))) { if (C->isNullValue() || // 0 >>s V -> 0. C->isAllOnesValue()) // -1 >>s V -> -1. return N->getOperand(0); } break; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: return DAGCombineExtBoolTrunc(N, DCI); case ISD::TRUNCATE: case ISD::SETCC: case ISD::SELECT_CC: return DAGCombineTruncBoolExt(N, DCI); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return combineFPToIntToFP(N, DCI); case ISD::STORE: { // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)). if (Subtarget.hasSTFIWX() && !cast(N)->isTruncatingStore() && N->getOperand(1).getOpcode() == ISD::FP_TO_SINT && N->getOperand(1).getValueType() == MVT::i32 && N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) { SDValue Val = N->getOperand(1).getOperand(0); if (Val.getValueType() == MVT::f32) { Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val); DCI.AddToWorklist(Val.getNode()); } Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val); DCI.AddToWorklist(Val.getNode()); SDValue Ops[] = { N->getOperand(0), Val, N->getOperand(2), DAG.getValueType(N->getOperand(1).getValueType()) }; Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, DAG.getVTList(MVT::Other), Ops, cast(N)->getMemoryVT(), cast(N)->getMemOperand()); DCI.AddToWorklist(Val.getNode()); return Val; } // Turn STORE (BSWAP) -> sthbrx/stwbrx. if (cast(N)->isUnindexed() && N->getOperand(1).getOpcode() == ISD::BSWAP && N->getOperand(1).getNode()->hasOneUse() && (N->getOperand(1).getValueType() == MVT::i32 || N->getOperand(1).getValueType() == MVT::i16 || (Subtarget.hasLDBRX() && Subtarget.isPPC64() && N->getOperand(1).getValueType() == MVT::i64))) { SDValue BSwapOp = N->getOperand(1).getOperand(0); // Do an any-extend to 32-bits if this is a half-word input. if (BSwapOp.getValueType() == MVT::i16) BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp); SDValue Ops[] = { N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(N->getOperand(1).getValueType()) }; return DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other), Ops, cast(N)->getMemoryVT(), cast(N)->getMemOperand()); } // For little endian, VSX stores require generating xxswapd/lxvd2x. EVT VT = N->getOperand(1).getValueType(); if (VT.isSimple()) { MVT StoreVT = VT.getSimpleVT(); if (Subtarget.hasVSX() && Subtarget.isLittleEndian() && (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 || StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32)) return expandVSXStoreForLE(N, DCI); } break; } case ISD::LOAD: { LoadSDNode *LD = cast(N); EVT VT = LD->getValueType(0); // For little endian, VSX loads require generating lxvd2x/xxswapd. if (VT.isSimple()) { MVT LoadVT = VT.getSimpleVT(); if (Subtarget.hasVSX() && Subtarget.isLittleEndian() && (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 || LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32)) return expandVSXLoadForLE(N, DCI); } // We sometimes end up with a 64-bit integer load, from which we extract // two single-precision floating-point numbers. This happens with // std::complex, and other similar structures, because of the way we // canonicalize structure copies. However, if we lack direct moves, // then the final bitcasts from the extracted integer values to the // floating-point numbers turn into store/load pairs. Even with direct moves, // just loading the two floating-point numbers is likely better. auto ReplaceTwoFloatLoad = [&]() { if (VT != MVT::i64) return false; if (LD->getExtensionType() != ISD::NON_EXTLOAD || LD->isVolatile()) return false; // We're looking for a sequence like this: // t13: i64,ch = load t0, t6, undef:i64 // t16: i64 = srl t13, Constant:i32<32> // t17: i32 = truncate t16 // t18: f32 = bitcast t17 // t19: i32 = truncate t13 // t20: f32 = bitcast t19 if (!LD->hasNUsesOfValue(2, 0)) return false; auto UI = LD->use_begin(); while (UI.getUse().getResNo() != 0) ++UI; SDNode *Trunc = *UI++; while (UI.getUse().getResNo() != 0) ++UI; SDNode *RightShift = *UI; if (Trunc->getOpcode() != ISD::TRUNCATE) std::swap(Trunc, RightShift); if (Trunc->getOpcode() != ISD::TRUNCATE || Trunc->getValueType(0) != MVT::i32 || !Trunc->hasOneUse()) return false; if (RightShift->getOpcode() != ISD::SRL || !isa(RightShift->getOperand(1)) || RightShift->getConstantOperandVal(1) != 32 || !RightShift->hasOneUse()) return false; SDNode *Trunc2 = *RightShift->use_begin(); if (Trunc2->getOpcode() != ISD::TRUNCATE || Trunc2->getValueType(0) != MVT::i32 || !Trunc2->hasOneUse()) return false; SDNode *Bitcast = *Trunc->use_begin(); SDNode *Bitcast2 = *Trunc2->use_begin(); if (Bitcast->getOpcode() != ISD::BITCAST || Bitcast->getValueType(0) != MVT::f32) return false; if (Bitcast2->getOpcode() != ISD::BITCAST || Bitcast2->getValueType(0) != MVT::f32) return false; if (Subtarget.isLittleEndian()) std::swap(Bitcast, Bitcast2); // Bitcast has the second float (in memory-layout order) and Bitcast2 // has the first one. SDValue BasePtr = LD->getBasePtr(); if (LD->isIndexed()) { assert(LD->getAddressingMode() == ISD::PRE_INC && "Non-pre-inc AM on PPC?"); BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, LD->getOffset()); } auto MMOFlags = LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile; SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr, LD->getPointerInfo(), LD->getAlignment(), MMOFlags, LD->getAAInfo()); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getIntPtrConstant(4, dl)); SDValue FloatLoad2 = DAG.getLoad( MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr, LD->getPointerInfo().getWithOffset(4), MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo()); if (LD->isIndexed()) { // Note that DAGCombine should re-form any pre-increment load(s) from // what is produced here if that makes sense. DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr); } DCI.CombineTo(Bitcast2, FloatLoad); DCI.CombineTo(Bitcast, FloatLoad2); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1), SDValue(FloatLoad2.getNode(), 1)); return true; }; if (ReplaceTwoFloatLoad()) return SDValue(N, 0); EVT MemVT = LD->getMemoryVT(); Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext()); unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy); if (LD->isUnindexed() && VT.isVector() && ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) && // P8 and later hardware should just use LOAD. !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v4f32)) || (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) && LD->getAlignment() >= ScalarABIAlignment)) && LD->getAlignment() < ABIAlignment) { // This is a type-legal unaligned Altivec or QPX load. SDValue Chain = LD->getChain(); SDValue Ptr = LD->getBasePtr(); bool isLittleEndian = Subtarget.isLittleEndian(); // This implements the loading of unaligned vectors as described in // the venerable Apple Velocity Engine overview. Specifically: // https://developer.apple.com/hardwaredrivers/ve/alignment.html // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html // // The general idea is to expand a sequence of one or more unaligned // loads into an alignment-based permutation-control instruction (lvsl // or lvsr), a series of regular vector loads (which always truncate // their input address to an aligned address), and a series of // permutations. The results of these permutations are the requested // loaded values. The trick is that the last "extra" load is not taken // from the address you might suspect (sizeof(vector) bytes after the // last requested load), but rather sizeof(vector) - 1 bytes after the // last requested vector. The point of this is to avoid a page fault if // the base address happened to be aligned. This works because if the // base address is aligned, then adding less than a full vector length // will cause the last vector in the sequence to be (re)loaded. // Otherwise, the next vector will be fetched as you might suspect was // necessary. // We might be able to reuse the permutation generation from // a different base address offset from this one by an aligned amount. // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this // optimization later. Intrinsic::ID Intr, IntrLD, IntrPerm; MVT PermCntlTy, PermTy, LDTy; if (Subtarget.hasAltivec()) { Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr : Intrinsic::ppc_altivec_lvsl; IntrLD = Intrinsic::ppc_altivec_lvx; IntrPerm = Intrinsic::ppc_altivec_vperm; PermCntlTy = MVT::v16i8; PermTy = MVT::v4i32; LDTy = MVT::v4i32; } else { Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld : Intrinsic::ppc_qpx_qvlpcls; IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd : Intrinsic::ppc_qpx_qvlfs; IntrPerm = Intrinsic::ppc_qpx_qvfperm; PermCntlTy = MVT::v4f64; PermTy = MVT::v4f64; LDTy = MemVT.getSimpleVT(); } SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy); // Create the new MMO for the new base load. It is like the original MMO, // but represents an area in memory almost twice the vector size centered // on the original address. If the address is unaligned, we might start // reading up to (sizeof(vector)-1) bytes below the address of the // original unaligned load. MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *BaseMMO = MF.getMachineMemOperand(LD->getMemOperand(), -(long)MemVT.getStoreSize()+1, 2*MemVT.getStoreSize()-1); // Create the new base load. SDValue LDXIntID = DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout())); SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr }; SDValue BaseLoad = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, DAG.getVTList(PermTy, MVT::Other), BaseLoadOps, LDTy, BaseMMO); // Note that the value of IncOffset (which is provided to the next // load's pointer info offset value, and thus used to calculate the // alignment), and the value of IncValue (which is actually used to // increment the pointer value) are different! This is because we // require the next load to appear to be aligned, even though it // is actually offset from the base pointer by a lesser amount. int IncOffset = VT.getSizeInBits() / 8; int IncValue = IncOffset; // Walk (both up and down) the chain looking for another load at the real // (aligned) offset (the alignment of the other load does not matter in // this case). If found, then do not use the offset reduction trick, as // that will prevent the loads from being later combined (as they would // otherwise be duplicates). if (!findConsecutiveLoad(LD, DAG)) --IncValue; SDValue Increment = DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout())); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); MachineMemOperand *ExtraMMO = MF.getMachineMemOperand(LD->getMemOperand(), 1, 2*MemVT.getStoreSize()-1); SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr }; SDValue ExtraLoad = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, DAG.getVTList(PermTy, MVT::Other), ExtraLoadOps, LDTy, ExtraMMO); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, BaseLoad.getValue(1), ExtraLoad.getValue(1)); // Because vperm has a big-endian bias, we must reverse the order // of the input vectors and complement the permute control vector // when generating little endian code. We have already handled the // latter by using lvsr instead of lvsl, so just reverse BaseLoad // and ExtraLoad here. SDValue Perm; if (isLittleEndian) Perm = BuildIntrinsicOp(IntrPerm, ExtraLoad, BaseLoad, PermCntl, DAG, dl); else Perm = BuildIntrinsicOp(IntrPerm, BaseLoad, ExtraLoad, PermCntl, DAG, dl); if (VT != PermTy) Perm = Subtarget.hasAltivec() ? DAG.getNode(ISD::BITCAST, dl, VT, Perm) : DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX DAG.getTargetConstant(1, dl, MVT::i64)); // second argument is 1 because this rounding // is always exact. // The output of the permutation is our loaded result, the TokenFactor is // our new chain. DCI.CombineTo(N, Perm, TF); return SDValue(N, 0); } } break; case ISD::INTRINSIC_WO_CHAIN: { bool isLittleEndian = Subtarget.isLittleEndian(); unsigned IID = cast(N->getOperand(0))->getZExtValue(); Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr : Intrinsic::ppc_altivec_lvsl); if ((IID == Intr || IID == Intrinsic::ppc_qpx_qvlpcld || IID == Intrinsic::ppc_qpx_qvlpcls) && N->getOperand(1)->getOpcode() == ISD::ADD) { SDValue Add = N->getOperand(1); int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ? 5 /* 32 byte alignment */ : 4 /* 16 byte alignment */; if (DAG.MaskedValueIsZero( Add->getOperand(1), APInt::getAllOnesValue(Bits /* alignment */) .zext( Add.getValueType().getScalarType().getSizeInBits()))) { SDNode *BasePtr = Add->getOperand(0).getNode(); for (SDNode::use_iterator UI = BasePtr->use_begin(), UE = BasePtr->use_end(); UI != UE; ++UI) { if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && cast(UI->getOperand(0))->getZExtValue() == IID) { // We've found another LVSL/LVSR, and this address is an aligned // multiple of that one. The results will be the same, so use the // one we've just found instead. return SDValue(*UI, 0); } } } if (isa(Add->getOperand(1))) { SDNode *BasePtr = Add->getOperand(0).getNode(); for (SDNode::use_iterator UI = BasePtr->use_begin(), UE = BasePtr->use_end(); UI != UE; ++UI) { if (UI->getOpcode() == ISD::ADD && isa(UI->getOperand(1)) && (cast(Add->getOperand(1))->getZExtValue() - cast(UI->getOperand(1))->getZExtValue()) % (1ULL << Bits) == 0) { SDNode *OtherAdd = *UI; for (SDNode::use_iterator VI = OtherAdd->use_begin(), VE = OtherAdd->use_end(); VI != VE; ++VI) { if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && cast(VI->getOperand(0))->getZExtValue() == IID) { return SDValue(*VI, 0); } } } } } } } break; case ISD::INTRINSIC_W_CHAIN: { // For little endian, VSX loads require generating lxvd2x/xxswapd. if (Subtarget.hasVSX() && Subtarget.isLittleEndian()) { switch (cast(N->getOperand(1))->getZExtValue()) { default: break; case Intrinsic::ppc_vsx_lxvw4x: case Intrinsic::ppc_vsx_lxvd2x: return expandVSXLoadForLE(N, DCI); } } break; } case ISD::INTRINSIC_VOID: { // For little endian, VSX stores require generating xxswapd/stxvd2x. if (Subtarget.hasVSX() && Subtarget.isLittleEndian()) { switch (cast(N->getOperand(1))->getZExtValue()) { default: break; case Intrinsic::ppc_vsx_stxvw4x: case Intrinsic::ppc_vsx_stxvd2x: return expandVSXStoreForLE(N, DCI); } } break; } case ISD::BSWAP: // Turn BSWAP (LOAD) -> lhbrx/lwbrx. if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) && N->getOperand(0).hasOneUse() && (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 || (Subtarget.hasLDBRX() && Subtarget.isPPC64() && N->getValueType(0) == MVT::i64))) { SDValue Load = N->getOperand(0); LoadSDNode *LD = cast(Load); // Create the byte-swapping load. SDValue Ops[] = { LD->getChain(), // Chain LD->getBasePtr(), // Ptr DAG.getValueType(N->getValueType(0)) // VT }; SDValue BSLoad = DAG.getMemIntrinsicNode(PPCISD::LBRX, dl, DAG.getVTList(N->getValueType(0) == MVT::i64 ? MVT::i64 : MVT::i32, MVT::Other), Ops, LD->getMemoryVT(), LD->getMemOperand()); // If this is an i16 load, insert the truncate. SDValue ResVal = BSLoad; if (N->getValueType(0) == MVT::i16) ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad); // First, combine the bswap away. This makes the value produced by the // load dead. DCI.CombineTo(N, ResVal); // Next, combine the load away, we give it a bogus result value but a real // chain result. The result value is dead because the bswap is dead. DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1)); // Return N so it doesn't get rechecked! return SDValue(N, 0); } break; case PPCISD::VCMP: { // If a VCMPo node already exists with exactly the same operands as this // node, use its result instead of this node (VCMPo computes both a CR6 and // a normal output). // if (!N->getOperand(0).hasOneUse() && !N->getOperand(1).hasOneUse() && !N->getOperand(2).hasOneUse()) { // Scan all of the users of the LHS, looking for VCMPo's that match. SDNode *VCMPoNode = nullptr; SDNode *LHSN = N->getOperand(0).getNode(); for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end(); UI != E; ++UI) if (UI->getOpcode() == PPCISD::VCMPo && UI->getOperand(1) == N->getOperand(1) && UI->getOperand(2) == N->getOperand(2) && UI->getOperand(0) == N->getOperand(0)) { VCMPoNode = *UI; break; } // If there is no VCMPo node, or if the flag value has a single use, don't // transform this. if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1)) break; // Look at the (necessarily single) use of the flag value. If it has a // chain, this transformation is more complex. Note that multiple things // could use the value result, which we should ignore. SDNode *FlagUser = nullptr; for (SDNode::use_iterator UI = VCMPoNode->use_begin(); FlagUser == nullptr; ++UI) { assert(UI != VCMPoNode->use_end() && "Didn't find user!"); SDNode *User = *UI; for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) { if (User->getOperand(i) == SDValue(VCMPoNode, 1)) { FlagUser = User; break; } } } // If the user is a MFOCRF instruction, we know this is safe. // Otherwise we give up for right now. if (FlagUser->getOpcode() == PPCISD::MFOCRF) return SDValue(VCMPoNode, 0); } break; } case ISD::BRCOND: { SDValue Cond = N->getOperand(1); SDValue Target = N->getOperand(2); if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN && cast(Cond.getOperand(1))->getZExtValue() == Intrinsic::ppc_is_decremented_ctr_nonzero) { // We now need to make the intrinsic dead (it cannot be instruction // selected). DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0)); assert(Cond.getNode()->hasOneUse() && "Counter decrement has more than one use"); return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other, N->getOperand(0), Target); } } break; case ISD::BR_CC: { // If this is a branch on an altivec predicate comparison, lower this so // that we don't have to do a MFOCRF: instead, branch directly on CR6. This // lowering is done pre-legalize, because the legalizer lowers the predicate // compare down to code that is difficult to reassemble. ISD::CondCode CC = cast(N->getOperand(1))->get(); SDValue LHS = N->getOperand(2), RHS = N->getOperand(3); // Sometimes the promoted value of the intrinsic is ANDed by some non-zero // value. If so, pass-through the AND to get to the intrinsic. if (LHS.getOpcode() == ISD::AND && LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN && cast(LHS.getOperand(0).getOperand(1))->getZExtValue() == Intrinsic::ppc_is_decremented_ctr_nonzero && isa(LHS.getOperand(1)) && !isNullConstant(LHS.getOperand(1))) LHS = LHS.getOperand(0); if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN && cast(LHS.getOperand(1))->getZExtValue() == Intrinsic::ppc_is_decremented_ctr_nonzero && isa(RHS)) { assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Counter decrement comparison is not EQ or NE"); unsigned Val = cast(RHS)->getZExtValue(); bool isBDNZ = (CC == ISD::SETEQ && Val) || (CC == ISD::SETNE && !Val); // We now need to make the intrinsic dead (it cannot be instruction // selected). DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0)); assert(LHS.getNode()->hasOneUse() && "Counter decrement has more than one use"); return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other, N->getOperand(0), N->getOperand(4)); } int CompareOpc; bool isDot; if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN && isa(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) && getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) { assert(isDot && "Can't compare against a vector result!"); // If this is a comparison against something other than 0/1, then we know // that the condition is never/always true. unsigned Val = cast(RHS)->getZExtValue(); if (Val != 0 && Val != 1) { if (CC == ISD::SETEQ) // Cond never true, remove branch. return N->getOperand(0); // Always !=, turn it into an unconditional branch. return DAG.getNode(ISD::BR, dl, MVT::Other, N->getOperand(0), N->getOperand(4)); } bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0); // Create the PPCISD altivec 'dot' comparison node. SDValue Ops[] = { LHS.getOperand(2), // LHS of compare LHS.getOperand(3), // RHS of compare DAG.getConstant(CompareOpc, dl, MVT::i32) }; EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue }; SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); // Unpack the result based on how the target uses it. PPC::Predicate CompOpc; switch (cast(LHS.getOperand(1))->getZExtValue()) { default: // Can't happen, don't crash on invalid number though. case 0: // Branch on the value of the EQ bit of CR6. CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE; break; case 1: // Branch on the inverted value of the EQ bit of CR6. CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ; break; case 2: // Branch on the value of the LT bit of CR6. CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE; break; case 3: // Branch on the inverted value of the LT bit of CR6. CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT; break; } return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0), DAG.getConstant(CompOpc, dl, MVT::i32), DAG.getRegister(PPC::CR6, MVT::i32), N->getOperand(4), CompNode.getValue(1)); } break; } case ISD::BUILD_VECTOR: return DAGCombineBuildVector(N, DCI); } return SDValue(); } SDValue PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector *Created) const { // fold (sdiv X, pow2) EVT VT = N->getValueType(0); if (VT == MVT::i64 && !Subtarget.isPPC64()) return SDValue(); if ((VT != MVT::i32 && VT != MVT::i64) || !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) return SDValue(); SDLoc DL(N); SDValue N0 = N->getOperand(0); bool IsNegPow2 = (-Divisor).isPowerOf2(); unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros(); SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT); SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt); if (Created) Created->push_back(Op.getNode()); if (IsNegPow2) { Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op); if (Created) Created->push_back(Op.getNode()); } return Op; } //===----------------------------------------------------------------------===// // Inline Assembly Support //===----------------------------------------------------------------------===// void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const { KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0); switch (Op.getOpcode()) { default: break; case PPCISD::LBRX: { // lhbrx is known to have the top bits cleared out. if (cast(Op.getOperand(2))->getVT() == MVT::i16) KnownZero = 0xFFFF0000; break; } case ISD::INTRINSIC_WO_CHAIN: { switch (cast(Op.getOperand(0))->getZExtValue()) { default: break; case Intrinsic::ppc_altivec_vcmpbfp_p: case Intrinsic::ppc_altivec_vcmpeqfp_p: case Intrinsic::ppc_altivec_vcmpequb_p: case Intrinsic::ppc_altivec_vcmpequh_p: case Intrinsic::ppc_altivec_vcmpequw_p: case Intrinsic::ppc_altivec_vcmpequd_p: case Intrinsic::ppc_altivec_vcmpgefp_p: case Intrinsic::ppc_altivec_vcmpgtfp_p: case Intrinsic::ppc_altivec_vcmpgtsb_p: case Intrinsic::ppc_altivec_vcmpgtsh_p: case Intrinsic::ppc_altivec_vcmpgtsw_p: case Intrinsic::ppc_altivec_vcmpgtsd_p: case Intrinsic::ppc_altivec_vcmpgtub_p: case Intrinsic::ppc_altivec_vcmpgtuh_p: case Intrinsic::ppc_altivec_vcmpgtuw_p: case Intrinsic::ppc_altivec_vcmpgtud_p: KnownZero = ~1U; // All bits but the low one are known to be zero. break; } } } } unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { switch (Subtarget.getDarwinDirective()) { default: break; case PPC::DIR_970: case PPC::DIR_PWR4: case PPC::DIR_PWR5: case PPC::DIR_PWR5X: case PPC::DIR_PWR6: case PPC::DIR_PWR6X: case PPC::DIR_PWR7: case PPC::DIR_PWR8: case PPC::DIR_PWR9: { if (!ML) break; const PPCInstrInfo *TII = Subtarget.getInstrInfo(); // For small loops (between 5 and 8 instructions), align to a 32-byte // boundary so that the entire loop fits in one instruction-cache line. uint64_t LoopSize = 0; for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I) for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) { LoopSize += TII->GetInstSizeInBytes(*J); if (LoopSize > 32) break; } if (LoopSize > 16 && LoopSize <= 32) return 5; break; } } return TargetLowering::getPrefLoopAlignment(ML); } /// getConstraintType - Given a constraint, return the type of /// constraint it is for this target. PPCTargetLowering::ConstraintType PPCTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; case 'b': case 'r': case 'f': case 'd': case 'v': case 'y': return C_RegisterClass; case 'Z': // FIXME: While Z does indicate a memory constraint, it specifically // indicates an r+r address (used in conjunction with the 'y' modifier // in the replacement string). Currently, we're forcing the base // register to be r0 in the asm printer (which is interpreted as zero) // and forming the complete address in the second register. This is // suboptimal. return C_Memory; } } else if (Constraint == "wc") { // individual CR bits. return C_RegisterClass; } else if (Constraint == "wa" || Constraint == "wd" || Constraint == "wf" || Constraint == "ws") { return C_RegisterClass; // VSX registers. } return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight PPCTargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. if (StringRef(constraint) == "wc" && type->isIntegerTy(1)) return CW_Register; // an individual CR bit. else if ((StringRef(constraint) == "wa" || StringRef(constraint) == "wd" || StringRef(constraint) == "wf") && type->isVectorTy()) return CW_Register; else if (StringRef(constraint) == "ws" && type->isDoubleTy()) return CW_Register; switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); break; case 'b': if (type->isIntegerTy()) weight = CW_Register; break; case 'f': if (type->isFloatTy()) weight = CW_Register; break; case 'd': if (type->isDoubleTy()) weight = CW_Register; break; case 'v': if (type->isVectorTy()) weight = CW_Register; break; case 'y': weight = CW_Register; break; case 'Z': weight = CW_Memory; break; } return weight; } std::pair PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { // GCC RS6000 Constraint Letters switch (Constraint[0]) { case 'b': // R1-R31 if (VT == MVT::i64 && Subtarget.isPPC64()) return std::make_pair(0U, &PPC::G8RC_NOX0RegClass); return std::make_pair(0U, &PPC::GPRC_NOR0RegClass); case 'r': // R0-R31 if (VT == MVT::i64 && Subtarget.isPPC64()) return std::make_pair(0U, &PPC::G8RCRegClass); return std::make_pair(0U, &PPC::GPRCRegClass); // 'd' and 'f' constraints are both defined to be "the floating point // registers", where one is for 32-bit and the other for 64-bit. We don't // really care overly much here so just give them all the same reg classes. case 'd': case 'f': if (VT == MVT::f32 || VT == MVT::i32) return std::make_pair(0U, &PPC::F4RCRegClass); if (VT == MVT::f64 || VT == MVT::i64) return std::make_pair(0U, &PPC::F8RCRegClass); if (VT == MVT::v4f64 && Subtarget.hasQPX()) return std::make_pair(0U, &PPC::QFRCRegClass); if (VT == MVT::v4f32 && Subtarget.hasQPX()) return std::make_pair(0U, &PPC::QSRCRegClass); break; case 'v': if (VT == MVT::v4f64 && Subtarget.hasQPX()) return std::make_pair(0U, &PPC::QFRCRegClass); if (VT == MVT::v4f32 && Subtarget.hasQPX()) return std::make_pair(0U, &PPC::QSRCRegClass); if (Subtarget.hasAltivec()) return std::make_pair(0U, &PPC::VRRCRegClass); case 'y': // crrc return std::make_pair(0U, &PPC::CRRCRegClass); } } else if (Constraint == "wc" && Subtarget.useCRBits()) { // An individual CR bit. return std::make_pair(0U, &PPC::CRBITRCRegClass); } else if ((Constraint == "wa" || Constraint == "wd" || Constraint == "wf") && Subtarget.hasVSX()) { return std::make_pair(0U, &PPC::VSRCRegClass); } else if (Constraint == "ws" && Subtarget.hasVSX()) { if (VT == MVT::f32 && Subtarget.hasP8Vector()) return std::make_pair(0U, &PPC::VSSRCRegClass); else return std::make_pair(0U, &PPC::VSFRCRegClass); } std::pair R = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers // (which we call X[0-9]+). If a 64-bit value has been requested, and a // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent // register. // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use // the AsmName field from *RegisterInfo.td, then this would not be necessary. if (R.first && VT == MVT::i64 && Subtarget.isPPC64() && PPC::GPRCRegClass.contains(R.first)) return std::make_pair(TRI->getMatchingSuperReg(R.first, PPC::sub_32, &PPC::G8RCRegClass), &PPC::G8RCRegClass); // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same. if (!R.second && StringRef("{cc}").equals_lower(Constraint)) { R.first = PPC::CR0; R.second = &PPC::CRRCRegClass; } return R; } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector&Ops, SelectionDAG &DAG) const { SDValue Result; // Only support length 1 constraints. if (Constraint.length() > 1) return; char Letter = Constraint[0]; switch (Letter) { default: break; case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': { ConstantSDNode *CST = dyn_cast(Op); if (!CST) return; // Must be an immediate to match. SDLoc dl(Op); int64_t Value = CST->getSExtValue(); EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative // numbers are printed as such. switch (Letter) { default: llvm_unreachable("Unknown constraint letter!"); case 'I': // "I" is a signed 16-bit constant. if (isInt<16>(Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'J': // "J" is a constant with only the high-order 16 bits nonzero. if (isShiftedUInt<16, 16>(Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'L': // "L" is a signed 16-bit constant shifted left 16 bits. if (isShiftedInt<16, 16>(Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'K': // "K" is a constant with only the low-order 16 bits nonzero. if (isUInt<16>(Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'M': // "M" is a constant that is greater than 31. if (Value > 31) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'N': // "N" is a positive constant that is an exact power of two. if (Value > 0 && isPowerOf2_64(Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'O': // "O" is the constant zero. if (Value == 0) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'P': // "P" is a constant whose negation is a signed 16-bit constant. if (isInt<16>(-Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; } break; } } if (Result.getNode()) { Ops.push_back(Result); return; } // Handle standard constraint letters. TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } // isLegalAddressingMode - Return true if the addressing mode represented // by AM is legal for this target, for a load/store of the specified type. bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { // PPC does not allow r+i addressing modes for vectors! if (Ty->isVectorTy() && AM.BaseOffs != 0) return false; // PPC allows a sign-extended 16-bit immediate field. if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1) return false; // No global is ever allowed as a base. if (AM.BaseGV) return false; // PPC only support r+r, switch (AM.Scale) { case 0: // "r+i" or just "i", depending on HasBaseReg. break; case 1: if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed. return false; // Otherwise we have r+r or r+i. break; case 2: if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed. return false; // Allow 2*r as r+r. break; default: // No other scales are supported. return false; } return true; } SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); MFI->setReturnAddressIsTaken(true); if (verifyReturnAddressArgumentIsConstant(Op, DAG)) return SDValue(); SDLoc dl(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); // Make sure the function does not optimize away the store of the RA to // the stack. PPCFunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->setLRStoreRequired(); bool isPPC64 = Subtarget.isPPC64(); auto PtrVT = getPointerTy(MF.getDataLayout()); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl, isPPC64 ? MVT::i64 : MVT::i32); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), MachinePointerInfo()); } // Just load the return address off the stack. SDValue RetAddrFI = getReturnAddrFrameIndex(DAG); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, MachinePointerInfo()); } SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); MFI->setFrameAddressIsTaken(true); EVT PtrVT = getPointerTy(MF.getDataLayout()); bool isPPC64 = PtrVT == MVT::i64; // Naked functions never have a frame pointer, and so we use r1. For all // other functions, this decision must be delayed until during PEI. unsigned FrameReg; if (MF.getFunction()->hasFnAttribute(Attribute::Naked)) FrameReg = isPPC64 ? PPC::X1 : PPC::R1; else FrameReg = isPPC64 ? PPC::FP8 : PPC::FP; SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); while (Depth--) FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(), FrameAddr, MachinePointerInfo()); return FrameAddr; } // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { bool isPPC64 = Subtarget.isPPC64(); bool isDarwinABI = Subtarget.isDarwinABI(); if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) || (!isPPC64 && VT != MVT::i32)) report_fatal_error("Invalid register global variable type"); bool is64Bit = isPPC64 && VT == MVT::i64; unsigned Reg = StringSwitch(RegName) .Case("r1", is64Bit ? PPC::X1 : PPC::R1) .Case("r2", (isDarwinABI || isPPC64) ? 0 : PPC::R2) .Case("r13", (!isPPC64 && isDarwinABI) ? 0 : (is64Bit ? PPC::X13 : PPC::R13)) .Default(0); if (Reg) return Reg; report_fatal_error("Invalid register name global variable"); } bool PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The PowerPC target isn't yet aware of offsets. return false; } bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const { switch (Intrinsic) { case Intrinsic::ppc_qpx_qvlfd: case Intrinsic::ppc_qpx_qvlfs: case Intrinsic::ppc_qpx_qvlfcd: case Intrinsic::ppc_qpx_qvlfcs: case Intrinsic::ppc_qpx_qvlfiwa: case Intrinsic::ppc_qpx_qvlfiwz: case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: case Intrinsic::ppc_altivec_lvebx: case Intrinsic::ppc_altivec_lvehx: case Intrinsic::ppc_altivec_lvewx: case Intrinsic::ppc_vsx_lxvd2x: case Intrinsic::ppc_vsx_lxvw4x: { EVT VT; switch (Intrinsic) { case Intrinsic::ppc_altivec_lvebx: VT = MVT::i8; break; case Intrinsic::ppc_altivec_lvehx: VT = MVT::i16; break; case Intrinsic::ppc_altivec_lvewx: VT = MVT::i32; break; case Intrinsic::ppc_vsx_lxvd2x: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvlfd: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvlfs: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvlfcd: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvlfcs: VT = MVT::v2f32; break; default: VT = MVT::v4i32; break; } Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = VT; Info.ptrVal = I.getArgOperand(0); Info.offset = -VT.getStoreSize()+1; Info.size = 2*VT.getStoreSize()-1; Info.align = 1; Info.vol = false; Info.readMem = true; Info.writeMem = false; return true; } case Intrinsic::ppc_qpx_qvlfda: case Intrinsic::ppc_qpx_qvlfsa: case Intrinsic::ppc_qpx_qvlfcda: case Intrinsic::ppc_qpx_qvlfcsa: case Intrinsic::ppc_qpx_qvlfiwaa: case Intrinsic::ppc_qpx_qvlfiwza: { EVT VT; switch (Intrinsic) { case Intrinsic::ppc_qpx_qvlfda: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvlfsa: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvlfcda: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvlfcsa: VT = MVT::v2f32; break; default: VT = MVT::v4i32; break; } Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = VT; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.size = VT.getStoreSize(); Info.align = 1; Info.vol = false; Info.readMem = true; Info.writeMem = false; return true; } case Intrinsic::ppc_qpx_qvstfd: case Intrinsic::ppc_qpx_qvstfs: case Intrinsic::ppc_qpx_qvstfcd: case Intrinsic::ppc_qpx_qvstfcs: case Intrinsic::ppc_qpx_qvstfiw: case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: case Intrinsic::ppc_altivec_stvebx: case Intrinsic::ppc_altivec_stvehx: case Intrinsic::ppc_altivec_stvewx: case Intrinsic::ppc_vsx_stxvd2x: case Intrinsic::ppc_vsx_stxvw4x: { EVT VT; switch (Intrinsic) { case Intrinsic::ppc_altivec_stvebx: VT = MVT::i8; break; case Intrinsic::ppc_altivec_stvehx: VT = MVT::i16; break; case Intrinsic::ppc_altivec_stvewx: VT = MVT::i32; break; case Intrinsic::ppc_vsx_stxvd2x: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvstfd: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvstfs: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvstfcd: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvstfcs: VT = MVT::v2f32; break; default: VT = MVT::v4i32; break; } Info.opc = ISD::INTRINSIC_VOID; Info.memVT = VT; Info.ptrVal = I.getArgOperand(1); Info.offset = -VT.getStoreSize()+1; Info.size = 2*VT.getStoreSize()-1; Info.align = 1; Info.vol = false; Info.readMem = false; Info.writeMem = true; return true; } case Intrinsic::ppc_qpx_qvstfda: case Intrinsic::ppc_qpx_qvstfsa: case Intrinsic::ppc_qpx_qvstfcda: case Intrinsic::ppc_qpx_qvstfcsa: case Intrinsic::ppc_qpx_qvstfiwa: { EVT VT; switch (Intrinsic) { case Intrinsic::ppc_qpx_qvstfda: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvstfsa: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvstfcda: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvstfcsa: VT = MVT::v2f32; break; default: VT = MVT::v4i32; break; } Info.opc = ISD::INTRINSIC_VOID; Info.memVT = VT; Info.ptrVal = I.getArgOperand(1); Info.offset = 0; Info.size = VT.getStoreSize(); Info.align = 1; Info.vol = false; Info.readMem = false; Info.writeMem = true; return true; } default: break; } return false; } /// getOptimalMemOpType - Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, /// probably because the source does not need to be loaded. If 'IsMemset' is /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { if (getTargetMachine().getOptLevel() != CodeGenOpt::None) { const Function *F = MF.getFunction(); // When expanding a memset, require at least two QPX instructions to cover // the cost of loading the value to be stored from the constant pool. if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) && (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) && !F->hasFnAttribute(Attribute::NoImplicitFloat)) { return MVT::v4f64; } // We should use Altivec/VSX loads and stores when available. For unaligned // addresses, unaligned VSX loads are only fast starting with the P8. if (Subtarget.hasAltivec() && Size >= 16 && (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) || ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector()))) return MVT::v4i32; } if (Subtarget.isPPC64()) { return MVT::i64; } return MVT::i32; } /// \brief Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); return !(BitSize == 0 || BitSize > 64); } bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 == 64 && NumBits2 == 32; } bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (!VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 == 64 && NumBits2 == 32; } bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { // Generally speaking, zexts are not free, but they are free when they can be // folded with other operations. if (LoadSDNode *LD = dyn_cast(Val)) { EVT MemVT = LD->getMemoryVT(); if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 || (Subtarget.isPPC64() && MemVT == MVT::i32)) && (LD->getExtensionType() == ISD::NON_EXTLOAD || LD->getExtensionType() == ISD::ZEXTLOAD)) return true; } // FIXME: Add other cases... // - 32-bit shifts with a zext to i64 // - zext after ctlz, bswap, etc. // - zext after and by a constant mask return TargetLowering::isZExtFree(Val, VT2); } bool PPCTargetLowering::isFPExtFree(EVT VT) const { assert(VT.isFloatingPoint()); return true; } bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const { return isInt<16>(Imm) || isUInt<16>(Imm); } bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const { return isInt<16>(Imm) || isUInt<16>(Imm); } bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned, bool *Fast) const { if (DisablePPCUnaligned) return false; // PowerPC supports unaligned memory access for simple non-vector types. // Although accessing unaligned addresses is not as efficient as accessing // aligned addresses, it is generally more efficient than manual expansion, // and generally only traps for software emulation when crossing page // boundaries. if (!VT.isSimple()) return false; if (VT.getSimpleVT().isVector()) { if (Subtarget.hasVSX()) { if (VT != MVT::v2f64 && VT != MVT::v2i64 && VT != MVT::v4f32 && VT != MVT::v4i32) return false; } else { return false; } } if (VT == MVT::ppcf128) return false; if (Fast) *Fast = true; return true; } bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { VT = VT.getScalarType(); if (!VT.isSimple()) return false; switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: case MVT::f64: return true; default: break; } return false; } const MCPhysReg * PPCTargetLowering::getScratchRegisters(CallingConv::ID) const { // LR is a callee-save register, but we must treat it as clobbered by any call // site. Hence we include LR in the scratch registers, which are in turn added // as implicit-defs for stackmaps and patchpoints. The same reasoning applies // to CTR, which is used by any indirect call. static const MCPhysReg ScratchRegs[] = { PPC::X12, PPC::LR8, PPC::CTR8, 0 }; return ScratchRegs; } unsigned PPCTargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { return Subtarget.isPPC64() ? PPC::X3 : PPC::R3; } unsigned PPCTargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { return Subtarget.isPPC64() ? PPC::X4 : PPC::R4; } bool PPCTargetLowering::shouldExpandBuildVectorWithShuffles( EVT VT , unsigned DefinedValues) const { if (VT == MVT::v2i64) return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves if (Subtarget.hasVSX() || Subtarget.hasQPX()) return true; return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); } Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const { if (DisableILPPref || Subtarget.enableMachineScheduler()) return TargetLowering::getSchedulingPreference(N); return Sched::ILP; } // Create a fast isel object. FastISel * PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) const { return PPC::createFastISel(FuncInfo, LibInfo); } void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { if (Subtarget.isDarwinABI()) return; if (!Subtarget.isPPC64()) return; // Update IsSplitCSR in PPCFunctionInfo PPCFunctionInfo *PFI = Entry->getParent()->getInfo(); PFI->setIsSplitCSR(true); } void PPCTargetLowering::insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const { const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); if (!IStart) return; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (PPC::G8RCRegClass.contains(*I)) RC = &PPC::G8RCRegClass; else if (PPC::F8RCRegClass.contains(*I)) RC = &PPC::F8RCRegClass; else if (PPC::CRRCRegClass.contains(*I)) RC = &PPC::CRRCRegClass; else if (PPC::VRRCRegClass.contains(*I)) RC = &PPC::VRRCRegClass; else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); unsigned NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. assert(Entry->getParent()->getFunction()->hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); // Insert the copy-back instructions right before the terminator for (auto *Exit : Exits) BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } // Override to enable LOAD_STACK_GUARD lowering on Linux. bool PPCTargetLowering::useLoadStackGuardNode() const { if (!Subtarget.isTargetLinux()) return TargetLowering::useLoadStackGuardNode(); return true; } // Override to disable global variable loading on Linux. void PPCTargetLowering::insertSSPDeclarations(Module &M) const { if (!Subtarget.isTargetLinux()) return TargetLowering::insertSSPDeclarations(M); } Index: projects/clang390-import/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp =================================================================== --- projects/clang390-import/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp (revision 304769) +++ projects/clang390-import/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp (revision 304770) @@ -1,2270 +1,2260 @@ //===- Reassociate.cpp - Reassociate binary expressions -------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This pass reassociates commutative expressions in an order that is designed // to promote better constant propagation, GCSE, LICM, PRE, etc. // // For example: 4 + (x + 5) -> x + (4 + 5) // // In the implementation of this algorithm, constants are assigned rank = 0, // function arguments are rank = 1, and other values are assigned ranks // corresponding to the reverse post order traversal of current function // (starting at 2), which effectively gives values in deep loops higher rank // than values not in loops. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/Reassociate.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include using namespace llvm; using namespace reassociate; #define DEBUG_TYPE "reassociate" STATISTIC(NumChanged, "Number of insts reassociated"); STATISTIC(NumAnnihil, "Number of expr tree annihilated"); STATISTIC(NumFactor , "Number of multiplies factored"); #ifndef NDEBUG /// Print out the expression identified in the Ops list. /// static void PrintOps(Instruction *I, const SmallVectorImpl &Ops) { Module *M = I->getModule(); dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " " << *Ops[0].Op->getType() << '\t'; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { dbgs() << "[ "; Ops[i].Op->printAsOperand(dbgs(), false, M); dbgs() << ", #" << Ops[i].Rank << "] "; } } #endif /// Utility class representing a non-constant Xor-operand. We classify /// non-constant Xor-Operands into two categories: /// C1) The operand is in the form "X & C", where C is a constant and C != ~0 /// C2) /// C2.1) The operand is in the form of "X | C", where C is a non-zero /// constant. /// C2.2) Any operand E which doesn't fall into C1 and C2.1, we view this /// operand as "E | 0" class llvm::reassociate::XorOpnd { public: XorOpnd(Value *V); bool isInvalid() const { return SymbolicPart == nullptr; } bool isOrExpr() const { return isOr; } Value *getValue() const { return OrigVal; } Value *getSymbolicPart() const { return SymbolicPart; } unsigned getSymbolicRank() const { return SymbolicRank; } const APInt &getConstPart() const { return ConstPart; } void Invalidate() { SymbolicPart = OrigVal = nullptr; } void setSymbolicRank(unsigned R) { SymbolicRank = R; } private: Value *OrigVal; Value *SymbolicPart; APInt ConstPart; unsigned SymbolicRank; bool isOr; }; XorOpnd::XorOpnd(Value *V) { assert(!isa(V) && "No ConstantInt"); OrigVal = V; Instruction *I = dyn_cast(V); SymbolicRank = 0; if (I && (I->getOpcode() == Instruction::Or || I->getOpcode() == Instruction::And)) { Value *V0 = I->getOperand(0); Value *V1 = I->getOperand(1); if (isa(V0)) std::swap(V0, V1); if (ConstantInt *C = dyn_cast(V1)) { ConstPart = C->getValue(); SymbolicPart = V0; isOr = (I->getOpcode() == Instruction::Or); return; } } // view the operand as "V | 0" SymbolicPart = V; ConstPart = APInt::getNullValue(V->getType()->getIntegerBitWidth()); isOr = true; } /// Return true if V is an instruction of the specified opcode and if it /// only has one use. static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { if (V->hasOneUse() && isa(V) && cast(V)->getOpcode() == Opcode && (!isa(V) || cast(V)->hasUnsafeAlgebra())) return cast(V); return nullptr; } static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1, unsigned Opcode2) { if (V->hasOneUse() && isa(V) && (cast(V)->getOpcode() == Opcode1 || cast(V)->getOpcode() == Opcode2) && (!isa(V) || cast(V)->hasUnsafeAlgebra())) return cast(V); return nullptr; } -void ReassociatePass::BuildRankMap( - Function &F, ReversePostOrderTraversal &RPOT) { +void ReassociatePass::BuildRankMap(Function &F) { unsigned i = 2; // Assign distinct ranks to function arguments. for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) { ValueRankMap[&*I] = ++i; DEBUG(dbgs() << "Calculated Rank[" << I->getName() << "] = " << i << "\n"); } + ReversePostOrderTraversal RPOT(&F); for (BasicBlock *BB : RPOT) { unsigned BBRank = RankMap[BB] = ++i << 16; // Walk the basic block, adding precomputed ranks for any instructions that // we cannot move. This ensures that the ranks for these instructions are // all different in the block. for (Instruction &I : *BB) if (mayBeMemoryDependent(I)) ValueRankMap[&I] = ++BBRank; } } unsigned ReassociatePass::getRank(Value *V) { Instruction *I = dyn_cast(V); if (!I) { if (isa(V)) return ValueRankMap[V]; // Function argument. return 0; // Otherwise it's a global or constant, rank 0. } if (unsigned Rank = ValueRankMap[I]) return Rank; // Rank already known? // If this is an expression, return the 1+MAX(rank(LHS), rank(RHS)) so that // we can reassociate expressions for code motion! Since we do not recurse // for PHI nodes, we cannot have infinite recursion here, because there // cannot be loops in the value graph that do not go through PHI nodes. unsigned Rank = 0, MaxRank = RankMap[I->getParent()]; for (unsigned i = 0, e = I->getNumOperands(); i != e && Rank != MaxRank; ++i) Rank = std::max(Rank, getRank(I->getOperand(i))); // If this is a not or neg instruction, do not count it for rank. This // assures us that X and ~X will have the same rank. if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) && !BinaryOperator::isFNeg(I)) ++Rank; DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n"); return ValueRankMap[I] = Rank; } // Canonicalize constants to RHS. Otherwise, sort the operands by rank. void ReassociatePass::canonicalizeOperands(Instruction *I) { assert(isa(I) && "Expected binary operator."); assert(I->isCommutative() && "Expected commutative operator."); Value *LHS = I->getOperand(0); Value *RHS = I->getOperand(1); unsigned LHSRank = getRank(LHS); unsigned RHSRank = getRank(RHS); if (isa(RHS)) return; if (isa(LHS) || RHSRank < LHSRank) cast(I)->swapOperands(); } static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name, Instruction *InsertBefore, Value *FlagsOp) { if (S1->getType()->isIntOrIntVectorTy()) return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore); else { BinaryOperator *Res = BinaryOperator::CreateFAdd(S1, S2, Name, InsertBefore); Res->setFastMathFlags(cast(FlagsOp)->getFastMathFlags()); return Res; } } static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name, Instruction *InsertBefore, Value *FlagsOp) { if (S1->getType()->isIntOrIntVectorTy()) return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore); else { BinaryOperator *Res = BinaryOperator::CreateFMul(S1, S2, Name, InsertBefore); Res->setFastMathFlags(cast(FlagsOp)->getFastMathFlags()); return Res; } } static BinaryOperator *CreateNeg(Value *S1, const Twine &Name, Instruction *InsertBefore, Value *FlagsOp) { if (S1->getType()->isIntOrIntVectorTy()) return BinaryOperator::CreateNeg(S1, Name, InsertBefore); else { BinaryOperator *Res = BinaryOperator::CreateFNeg(S1, Name, InsertBefore); Res->setFastMathFlags(cast(FlagsOp)->getFastMathFlags()); return Res; } } /// Replace 0-X with X*-1. static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) { Type *Ty = Neg->getType(); Constant *NegOne = Ty->isIntOrIntVectorTy() ? ConstantInt::getAllOnesValue(Ty) : ConstantFP::get(Ty, -1.0); BinaryOperator *Res = CreateMul(Neg->getOperand(1), NegOne, "", Neg, Neg); Neg->setOperand(1, Constant::getNullValue(Ty)); // Drop use of op. Res->takeName(Neg); Neg->replaceAllUsesWith(Res); Res->setDebugLoc(Neg->getDebugLoc()); return Res; } /// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael /// function. This means that x^(2^k) === 1 mod 2^Bitwidth for /// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic. /// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every /// even x in Bitwidth-bit arithmetic. static unsigned CarmichaelShift(unsigned Bitwidth) { if (Bitwidth < 3) return Bitwidth - 1; return Bitwidth - 2; } /// Add the extra weight 'RHS' to the existing weight 'LHS', /// reducing the combined weight using any special properties of the operation. /// The existing weight LHS represents the computation X op X op ... op X where /// X occurs LHS times. The combined weight represents X op X op ... op X with /// X occurring LHS + RHS times. If op is "Xor" for example then the combined /// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even; /// the routine returns 1 in LHS in the first case, and 0 in LHS in the second. static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) { // If we were working with infinite precision arithmetic then the combined // weight would be LHS + RHS. But we are using finite precision arithmetic, // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct // for nilpotent operations and addition, but not for idempotent operations // and multiplication), so it is important to correctly reduce the combined // weight back into range if wrapping would be wrong. // If RHS is zero then the weight didn't change. if (RHS.isMinValue()) return; // If LHS is zero then the combined weight is RHS. if (LHS.isMinValue()) { LHS = RHS; return; } // From this point on we know that neither LHS nor RHS is zero. if (Instruction::isIdempotent(Opcode)) { // Idempotent means X op X === X, so any non-zero weight is equivalent to a // weight of 1. Keeping weights at zero or one also means that wrapping is // not a problem. assert(LHS == 1 && RHS == 1 && "Weights not reduced!"); return; // Return a weight of 1. } if (Instruction::isNilpotent(Opcode)) { // Nilpotent means X op X === 0, so reduce weights modulo 2. assert(LHS == 1 && RHS == 1 && "Weights not reduced!"); LHS = 0; // 1 + 1 === 0 modulo 2. return; } if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) { // TODO: Reduce the weight by exploiting nsw/nuw? LHS += RHS; return; } assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) && "Unknown associative operation!"); unsigned Bitwidth = LHS.getBitWidth(); // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth // bit number x, since either x is odd in which case x^CM = 1, or x is even in // which case both x^W and x^(W - CM) are zero. By subtracting off multiples // of CM like this weights can always be reduced to the range [0, CM+Bitwidth) // which by a happy accident means that they can always be represented using // Bitwidth bits. // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than // the Carmichael number). if (Bitwidth > 3) { /// CM - The value of Carmichael's lambda function. APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth)); // Any weight W >= Threshold can be replaced with W - CM. APInt Threshold = CM + Bitwidth; assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!"); // For Bitwidth 4 or more the following sum does not overflow. LHS += RHS; while (LHS.uge(Threshold)) LHS -= CM; } else { // To avoid problems with overflow do everything the same as above but using // a larger type. unsigned CM = 1U << CarmichaelShift(Bitwidth); unsigned Threshold = CM + Bitwidth; assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold && "Weights not reduced!"); unsigned Total = LHS.getZExtValue() + RHS.getZExtValue(); while (Total >= Threshold) Total -= CM; LHS = Total; } } typedef std::pair RepeatedValue; /// Given an associative binary expression, return the leaf /// nodes in Ops along with their weights (how many times the leaf occurs). The /// original expression is the same as /// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times /// op /// (Ops[1].first op Ops[1].first op ... Ops[1].first) <- Ops[1].second times /// op /// ... /// op /// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times /// /// Note that the values Ops[0].first, ..., Ops[N].first are all distinct. /// /// This routine may modify the function, in which case it returns 'true'. The /// changes it makes may well be destructive, changing the value computed by 'I' /// to something completely different. Thus if the routine returns 'true' then /// you MUST either replace I with a new expression computed from the Ops array, /// or use RewriteExprTree to put the values back in. /// /// A leaf node is either not a binary operation of the same kind as the root /// node 'I' (i.e. is not a binary operator at all, or is, but with a different /// opcode), or is the same kind of binary operator but has a use which either /// does not belong to the expression, or does belong to the expression but is /// a leaf node. Every leaf node has at least one use that is a non-leaf node /// of the expression, while for non-leaf nodes (except for the root 'I') every /// use is a non-leaf node of the expression. /// /// For example: /// expression graph node names /// /// + | I /// / \ | /// + + | A, B /// / \ / \ | /// * + * | C, D, E /// / \ / \ / \ | /// + * | F, G /// /// The leaf nodes are C, E, F and G. The Ops array will contain (maybe not in /// that order) (C, 1), (E, 1), (F, 2), (G, 2). /// /// The expression is maximal: if some instruction is a binary operator of the /// same kind as 'I', and all of its uses are non-leaf nodes of the expression, /// then the instruction also belongs to the expression, is not a leaf node of /// it, and its operands also belong to the expression (but may be leaf nodes). /// /// NOTE: This routine will set operands of non-leaf non-root nodes to undef in /// order to ensure that every non-root node in the expression has *exactly one* /// use by a non-leaf node of the expression. This destruction means that the /// caller MUST either replace 'I' with a new expression or use something like /// RewriteExprTree to put the values back in if the routine indicates that it /// made a change by returning 'true'. /// /// In the above example either the right operand of A or the left operand of B /// will be replaced by undef. If it is B's operand then this gives: /// /// + | I /// / \ | /// + + | A, B - operand of B replaced with undef /// / \ \ | /// * + * | C, D, E /// / \ / \ / \ | /// + * | F, G /// /// Note that such undef operands can only be reached by passing through 'I'. /// For example, if you visit operands recursively starting from a leaf node /// then you will never see such an undef operand unless you get back to 'I', /// which requires passing through a phi node. /// /// Note that this routine may also mutate binary operators of the wrong type /// that have all uses inside the expression (i.e. only used by non-leaf nodes /// of the expression) if it can turn them into binary operators of the right /// type and thus make the expression bigger. static bool LinearizeExprTree(BinaryOperator *I, SmallVectorImpl &Ops) { DEBUG(dbgs() << "LINEARIZE: " << *I << '\n'); unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits(); unsigned Opcode = I->getOpcode(); assert(I->isAssociative() && I->isCommutative() && "Expected an associative and commutative operation!"); // Visit all operands of the expression, keeping track of their weight (the // number of paths from the expression root to the operand, or if you like // the number of times that operand occurs in the linearized expression). // For example, if I = X + A, where X = A + B, then I, X and B have weight 1 // while A has weight two. // Worklist of non-leaf nodes (their operands are in the expression too) along // with their weights, representing a certain number of paths to the operator. // If an operator occurs in the worklist multiple times then we found multiple // ways to get to it. SmallVector, 8> Worklist; // (Op, Weight) Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1))); bool Changed = false; // Leaves of the expression are values that either aren't the right kind of // operation (eg: a constant, or a multiply in an add tree), or are, but have // some uses that are not inside the expression. For example, in I = X + X, // X = A + B, the value X has two uses (by I) that are in the expression. If // X has any other uses, for example in a return instruction, then we consider // X to be a leaf, and won't analyze it further. When we first visit a value, // if it has more than one use then at first we conservatively consider it to // be a leaf. Later, as the expression is explored, we may discover some more // uses of the value from inside the expression. If all uses turn out to be // from within the expression (and the value is a binary operator of the right // kind) then the value is no longer considered to be a leaf, and its operands // are explored. // Leaves - Keeps track of the set of putative leaves as well as the number of // paths to each leaf seen so far. typedef DenseMap LeafMap; LeafMap Leaves; // Leaf -> Total weight so far. SmallVector LeafOrder; // Ensure deterministic leaf output order. #ifndef NDEBUG SmallPtrSet Visited; // For sanity checking the iteration scheme. #endif while (!Worklist.empty()) { std::pair P = Worklist.pop_back_val(); I = P.first; // We examine the operands of this binary operator. for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands. Value *Op = I->getOperand(OpIdx); APInt Weight = P.second; // Number of paths to this operand. DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n"); assert(!Op->use_empty() && "No uses, so how did we get to it?!"); // If this is a binary operation of the right kind with only one use then // add its operands to the expression. if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { assert(Visited.insert(Op).second && "Not first visit!"); DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n"); Worklist.push_back(std::make_pair(BO, Weight)); continue; } // Appears to be a leaf. Is the operand already in the set of leaves? LeafMap::iterator It = Leaves.find(Op); if (It == Leaves.end()) { // Not in the leaf map. Must be the first time we saw this operand. assert(Visited.insert(Op).second && "Not first visit!"); if (!Op->hasOneUse()) { // This value has uses not accounted for by the expression, so it is // not safe to modify. Mark it as being a leaf. DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n"); LeafOrder.push_back(Op); Leaves[Op] = Weight; continue; } // No uses outside the expression, try morphing it. } else if (It != Leaves.end()) { // Already in the leaf map. assert(Visited.count(Op) && "In leaf map but not visited!"); // Update the number of paths to the leaf. IncorporateWeight(It->second, Weight, Opcode); #if 0 // TODO: Re-enable once PR13021 is fixed. // The leaf already has one use from inside the expression. As we want // exactly one such use, drop this new use of the leaf. assert(!Op->hasOneUse() && "Only one use, but we got here twice!"); I->setOperand(OpIdx, UndefValue::get(I->getType())); Changed = true; // If the leaf is a binary operation of the right kind and we now see // that its multiple original uses were in fact all by nodes belonging // to the expression, then no longer consider it to be a leaf and add // its operands to the expression. if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n"); Worklist.push_back(std::make_pair(BO, It->second)); Leaves.erase(It); continue; } #endif // If we still have uses that are not accounted for by the expression // then it is not safe to modify the value. if (!Op->hasOneUse()) continue; // No uses outside the expression, try morphing it. Weight = It->second; Leaves.erase(It); // Since the value may be morphed below. } // At this point we have a value which, first of all, is not a binary // expression of the right kind, and secondly, is only used inside the // expression. This means that it can safely be modified. See if we // can usefully morph it into an expression of the right kind. assert((!isa(Op) || cast(Op)->getOpcode() != Opcode || (isa(Op) && !cast(Op)->hasUnsafeAlgebra())) && "Should have been handled above!"); assert(Op->hasOneUse() && "Has uses outside the expression tree!"); // If this is a multiply expression, turn any internal negations into // multiplies by -1 so they can be reassociated. if (BinaryOperator *BO = dyn_cast(Op)) if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) || (Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) { DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO "); BO = LowerNegateToMultiply(BO); DEBUG(dbgs() << *BO << '\n'); Worklist.push_back(std::make_pair(BO, Weight)); Changed = true; continue; } // Failed to morph into an expression of the right type. This really is // a leaf. DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n"); assert(!isReassociableOp(Op, Opcode) && "Value was morphed?"); LeafOrder.push_back(Op); Leaves[Op] = Weight; } } // The leaves, repeated according to their weights, represent the linearized // form of the expression. for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) { Value *V = LeafOrder[i]; LeafMap::iterator It = Leaves.find(V); if (It == Leaves.end()) // Node initially thought to be a leaf wasn't. continue; assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!"); APInt Weight = It->second; if (Weight.isMinValue()) // Leaf already output or weight reduction eliminated it. continue; // Ensure the leaf is only output once. It->second = 0; Ops.push_back(std::make_pair(V, Weight)); } // For nilpotent operations or addition there may be no operands, for example // because the expression was "X xor X" or consisted of 2^Bitwidth additions: // in both cases the weight reduces to 0 causing the value to be skipped. if (Ops.empty()) { Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType()); assert(Identity && "Associative operation without identity!"); Ops.emplace_back(Identity, APInt(Bitwidth, 1)); } return Changed; } /// Now that the operands for this expression tree are /// linearized and optimized, emit them in-order. void ReassociatePass::RewriteExprTree(BinaryOperator *I, SmallVectorImpl &Ops) { assert(Ops.size() > 1 && "Single values should be used directly!"); // Since our optimizations should never increase the number of operations, the // new expression can usually be written reusing the existing binary operators // from the original expression tree, without creating any new instructions, // though the rewritten expression may have a completely different topology. // We take care to not change anything if the new expression will be the same // as the original. If more than trivial changes (like commuting operands) // were made then we are obliged to clear out any optional subclass data like // nsw flags. /// NodesToRewrite - Nodes from the original expression available for writing /// the new expression into. SmallVector NodesToRewrite; unsigned Opcode = I->getOpcode(); BinaryOperator *Op = I; /// NotRewritable - The operands being written will be the leaves of the new /// expression and must not be used as inner nodes (via NodesToRewrite) by /// mistake. Inner nodes are always reassociable, and usually leaves are not /// (if they were they would have been incorporated into the expression and so /// would not be leaves), so most of the time there is no danger of this. But /// in rare cases a leaf may become reassociable if an optimization kills uses /// of it, or it may momentarily become reassociable during rewriting (below) /// due it being removed as an operand of one of its uses. Ensure that misuse /// of leaf nodes as inner nodes cannot occur by remembering all of the future /// leaves and refusing to reuse any of them as inner nodes. SmallPtrSet NotRewritable; for (unsigned i = 0, e = Ops.size(); i != e; ++i) NotRewritable.insert(Ops[i].Op); // ExpressionChanged - Non-null if the rewritten expression differs from the // original in some non-trivial way, requiring the clearing of optional flags. // Flags are cleared from the operator in ExpressionChanged up to I inclusive. BinaryOperator *ExpressionChanged = nullptr; for (unsigned i = 0; ; ++i) { // The last operation (which comes earliest in the IR) is special as both // operands will come from Ops, rather than just one with the other being // a subexpression. if (i+2 == Ops.size()) { Value *NewLHS = Ops[i].Op; Value *NewRHS = Ops[i+1].Op; Value *OldLHS = Op->getOperand(0); Value *OldRHS = Op->getOperand(1); if (NewLHS == OldLHS && NewRHS == OldRHS) // Nothing changed, leave it alone. break; if (NewLHS == OldRHS && NewRHS == OldLHS) { // The order of the operands was reversed. Swap them. DEBUG(dbgs() << "RA: " << *Op << '\n'); Op->swapOperands(); DEBUG(dbgs() << "TO: " << *Op << '\n'); MadeChange = true; ++NumChanged; break; } // The new operation differs non-trivially from the original. Overwrite // the old operands with the new ones. DEBUG(dbgs() << "RA: " << *Op << '\n'); if (NewLHS != OldLHS) { BinaryOperator *BO = isReassociableOp(OldLHS, Opcode); if (BO && !NotRewritable.count(BO)) NodesToRewrite.push_back(BO); Op->setOperand(0, NewLHS); } if (NewRHS != OldRHS) { BinaryOperator *BO = isReassociableOp(OldRHS, Opcode); if (BO && !NotRewritable.count(BO)) NodesToRewrite.push_back(BO); Op->setOperand(1, NewRHS); } DEBUG(dbgs() << "TO: " << *Op << '\n'); ExpressionChanged = Op; MadeChange = true; ++NumChanged; break; } // Not the last operation. The left-hand side will be a sub-expression // while the right-hand side will be the current element of Ops. Value *NewRHS = Ops[i].Op; if (NewRHS != Op->getOperand(1)) { DEBUG(dbgs() << "RA: " << *Op << '\n'); if (NewRHS == Op->getOperand(0)) { // The new right-hand side was already present as the left operand. If // we are lucky then swapping the operands will sort out both of them. Op->swapOperands(); } else { // Overwrite with the new right-hand side. BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode); if (BO && !NotRewritable.count(BO)) NodesToRewrite.push_back(BO); Op->setOperand(1, NewRHS); ExpressionChanged = Op; } DEBUG(dbgs() << "TO: " << *Op << '\n'); MadeChange = true; ++NumChanged; } // Now deal with the left-hand side. If this is already an operation node // from the original expression then just rewrite the rest of the expression // into it. BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode); if (BO && !NotRewritable.count(BO)) { Op = BO; continue; } // Otherwise, grab a spare node from the original expression and use that as // the left-hand side. If there are no nodes left then the optimizers made // an expression with more nodes than the original! This usually means that // they did something stupid but it might mean that the problem was just too // hard (finding the mimimal number of multiplications needed to realize a // multiplication expression is NP-complete). Whatever the reason, smart or // stupid, create a new node if there are none left. BinaryOperator *NewOp; if (NodesToRewrite.empty()) { Constant *Undef = UndefValue::get(I->getType()); NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode), Undef, Undef, "", I); if (NewOp->getType()->isFPOrFPVectorTy()) NewOp->setFastMathFlags(I->getFastMathFlags()); } else { NewOp = NodesToRewrite.pop_back_val(); } DEBUG(dbgs() << "RA: " << *Op << '\n'); Op->setOperand(0, NewOp); DEBUG(dbgs() << "TO: " << *Op << '\n'); ExpressionChanged = Op; MadeChange = true; ++NumChanged; Op = NewOp; } // If the expression changed non-trivially then clear out all subclass data // starting from the operator specified in ExpressionChanged, and compactify // the operators to just before the expression root to guarantee that the // expression tree is dominated by all of Ops. if (ExpressionChanged) do { // Preserve FastMathFlags. if (isa(I)) { FastMathFlags Flags = I->getFastMathFlags(); ExpressionChanged->clearSubclassOptionalData(); ExpressionChanged->setFastMathFlags(Flags); } else ExpressionChanged->clearSubclassOptionalData(); if (ExpressionChanged == I) break; ExpressionChanged->moveBefore(I); ExpressionChanged = cast(*ExpressionChanged->user_begin()); } while (1); // Throw away any left over nodes from the original expression. for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i) RedoInsts.insert(NodesToRewrite[i]); } /// Insert instructions before the instruction pointed to by BI, /// that computes the negative version of the value specified. The negative /// version of the value is returned, and BI is left pointing at the instruction /// that should be processed next by the reassociation pass. /// Also add intermediate instructions to the redo list that are modified while /// pushing the negates through adds. These will be revisited to see if /// additional opportunities have been exposed. static Value *NegateValue(Value *V, Instruction *BI, SetVector> &ToRedo) { if (Constant *C = dyn_cast(V)) { if (C->getType()->isFPOrFPVectorTy()) { return ConstantExpr::getFNeg(C); } return ConstantExpr::getNeg(C); } // We are trying to expose opportunity for reassociation. One of the things // that we want to do to achieve this is to push a negation as deep into an // expression chain as possible, to expose the add instructions. In practice, // this means that we turn this: // X = -(A+12+C+D) into X = -A + -12 + -C + -D = -12 + -A + -C + -D // so that later, a: Y = 12+X could get reassociated with the -12 to eliminate // the constants. We assume that instcombine will clean up the mess later if // we introduce tons of unnecessary negation instructions. // if (BinaryOperator *I = isReassociableOp(V, Instruction::Add, Instruction::FAdd)) { // Push the negates through the add. I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo)); I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo)); if (I->getOpcode() == Instruction::Add) { I->setHasNoUnsignedWrap(false); I->setHasNoSignedWrap(false); } // We must move the add instruction here, because the neg instructions do // not dominate the old add instruction in general. By moving it, we are // assured that the neg instructions we just inserted dominate the // instruction we are about to insert after them. // I->moveBefore(BI); I->setName(I->getName()+".neg"); // Add the intermediate negates to the redo list as processing them later // could expose more reassociating opportunities. ToRedo.insert(I); return I; } // Okay, we need to materialize a negated version of V with an instruction. // Scan the use lists of V to see if we have one already. for (User *U : V->users()) { if (!BinaryOperator::isNeg(U) && !BinaryOperator::isFNeg(U)) continue; // We found one! Now we have to make sure that the definition dominates // this use. We do this by moving it to the entry block (if it is a // non-instruction value) or right after the definition. These negates will // be zapped by reassociate later, so we don't need much finesse here. BinaryOperator *TheNeg = cast(U); // Verify that the negate is in this function, V might be a constant expr. if (TheNeg->getParent()->getParent() != BI->getParent()->getParent()) continue; BasicBlock::iterator InsertPt; if (Instruction *InstInput = dyn_cast(V)) { if (InvokeInst *II = dyn_cast(InstInput)) { InsertPt = II->getNormalDest()->begin(); } else { InsertPt = ++InstInput->getIterator(); } while (isa(InsertPt)) ++InsertPt; } else { InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin(); } TheNeg->moveBefore(&*InsertPt); if (TheNeg->getOpcode() == Instruction::Sub) { TheNeg->setHasNoUnsignedWrap(false); TheNeg->setHasNoSignedWrap(false); } else { TheNeg->andIRFlags(BI); } ToRedo.insert(TheNeg); return TheNeg; } // Insert a 'neg' instruction that subtracts the value from zero to get the // negation. BinaryOperator *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI); ToRedo.insert(NewNeg); return NewNeg; } /// Return true if we should break up this subtract of X-Y into (X + -Y). static bool ShouldBreakUpSubtract(Instruction *Sub) { // If this is a negation, we can't split it up! if (BinaryOperator::isNeg(Sub) || BinaryOperator::isFNeg(Sub)) return false; // Don't breakup X - undef. if (isa(Sub->getOperand(1))) return false; // Don't bother to break this up unless either the LHS is an associable add or // subtract or if this is only used by one. Value *V0 = Sub->getOperand(0); if (isReassociableOp(V0, Instruction::Add, Instruction::FAdd) || isReassociableOp(V0, Instruction::Sub, Instruction::FSub)) return true; Value *V1 = Sub->getOperand(1); if (isReassociableOp(V1, Instruction::Add, Instruction::FAdd) || isReassociableOp(V1, Instruction::Sub, Instruction::FSub)) return true; Value *VB = Sub->user_back(); if (Sub->hasOneUse() && (isReassociableOp(VB, Instruction::Add, Instruction::FAdd) || isReassociableOp(VB, Instruction::Sub, Instruction::FSub))) return true; return false; } /// If we have (X-Y), and if either X is an add, or if this is only used by an /// add, transform this into (X+(0-Y)) to promote better reassociation. static BinaryOperator * BreakUpSubtract(Instruction *Sub, SetVector> &ToRedo) { // Convert a subtract into an add and a neg instruction. This allows sub // instructions to be commuted with other add instructions. // // Calculate the negative value of Operand 1 of the sub instruction, // and set it as the RHS of the add instruction we just made. // Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo); BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub); Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op. Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op. New->takeName(Sub); // Everyone now refers to the add instruction. Sub->replaceAllUsesWith(New); New->setDebugLoc(Sub->getDebugLoc()); DEBUG(dbgs() << "Negated: " << *New << '\n'); return New; } /// If this is a shift of a reassociable multiply or is used by one, change /// this into a multiply by a constant to assist with further reassociation. static BinaryOperator *ConvertShiftToMul(Instruction *Shl) { Constant *MulCst = ConstantInt::get(Shl->getType(), 1); MulCst = ConstantExpr::getShl(MulCst, cast(Shl->getOperand(1))); BinaryOperator *Mul = BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl); Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op. Mul->takeName(Shl); // Everyone now refers to the mul instruction. Shl->replaceAllUsesWith(Mul); Mul->setDebugLoc(Shl->getDebugLoc()); // We can safely preserve the nuw flag in all cases. It's also safe to turn a // nuw nsw shl into a nuw nsw mul. However, nsw in isolation requires special // handling. bool NSW = cast(Shl)->hasNoSignedWrap(); bool NUW = cast(Shl)->hasNoUnsignedWrap(); if (NSW && NUW) Mul->setHasNoSignedWrap(true); Mul->setHasNoUnsignedWrap(NUW); return Mul; } /// Scan backwards and forwards among values with the same rank as element i /// to see if X exists. If X does not exist, return i. This is useful when /// scanning for 'x' when we see '-x' because they both get the same rank. static unsigned FindInOperandList(SmallVectorImpl &Ops, unsigned i, Value *X) { unsigned XRank = Ops[i].Rank; unsigned e = Ops.size(); for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) { if (Ops[j].Op == X) return j; if (Instruction *I1 = dyn_cast(Ops[j].Op)) if (Instruction *I2 = dyn_cast(X)) if (I1->isIdenticalTo(I2)) return j; } // Scan backwards. for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j) { if (Ops[j].Op == X) return j; if (Instruction *I1 = dyn_cast(Ops[j].Op)) if (Instruction *I2 = dyn_cast(X)) if (I1->isIdenticalTo(I2)) return j; } return i; } /// Emit a tree of add instructions, summing Ops together /// and returning the result. Insert the tree before I. static Value *EmitAddTreeOfValues(Instruction *I, SmallVectorImpl &Ops){ if (Ops.size() == 1) return Ops.back(); Value *V1 = Ops.back(); Ops.pop_back(); Value *V2 = EmitAddTreeOfValues(I, Ops); return CreateAdd(V2, V1, "tmp", I, I); } /// If V is an expression tree that is a multiplication sequence, /// and if this sequence contains a multiply by Factor, /// remove Factor from the tree and return the new tree. Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) { BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul); if (!BO) return nullptr; SmallVector Tree; MadeChange |= LinearizeExprTree(BO, Tree); SmallVector Factors; Factors.reserve(Tree.size()); for (unsigned i = 0, e = Tree.size(); i != e; ++i) { RepeatedValue E = Tree[i]; Factors.append(E.second.getZExtValue(), ValueEntry(getRank(E.first), E.first)); } bool FoundFactor = false; bool NeedsNegate = false; for (unsigned i = 0, e = Factors.size(); i != e; ++i) { if (Factors[i].Op == Factor) { FoundFactor = true; Factors.erase(Factors.begin()+i); break; } // If this is a negative version of this factor, remove it. if (ConstantInt *FC1 = dyn_cast(Factor)) { if (ConstantInt *FC2 = dyn_cast(Factors[i].Op)) if (FC1->getValue() == -FC2->getValue()) { FoundFactor = NeedsNegate = true; Factors.erase(Factors.begin()+i); break; } } else if (ConstantFP *FC1 = dyn_cast(Factor)) { if (ConstantFP *FC2 = dyn_cast(Factors[i].Op)) { const APFloat &F1 = FC1->getValueAPF(); APFloat F2(FC2->getValueAPF()); F2.changeSign(); if (F1.compare(F2) == APFloat::cmpEqual) { FoundFactor = NeedsNegate = true; Factors.erase(Factors.begin() + i); break; } } } } if (!FoundFactor) { // Make sure to restore the operands to the expression tree. RewriteExprTree(BO, Factors); return nullptr; } BasicBlock::iterator InsertPt = ++BO->getIterator(); // If this was just a single multiply, remove the multiply and return the only // remaining operand. if (Factors.size() == 1) { RedoInsts.insert(BO); V = Factors[0].Op; } else { RewriteExprTree(BO, Factors); V = BO; } if (NeedsNegate) V = CreateNeg(V, "neg", &*InsertPt, BO); return V; } /// If V is a single-use multiply, recursively add its operands as factors, /// otherwise add V to the list of factors. /// /// Ops is the top-level list of add operands we're trying to factor. static void FindSingleUseMultiplyFactors(Value *V, SmallVectorImpl &Factors, const SmallVectorImpl &Ops) { BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul); if (!BO) { Factors.push_back(V); return; } // Otherwise, add the LHS and RHS to the list of factors. FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops); FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops); } /// Optimize a series of operands to an 'and', 'or', or 'xor' instruction. /// This optimizes based on identities. If it can be reduced to a single Value, /// it is returned, otherwise the Ops list is mutated as necessary. static Value *OptimizeAndOrXor(unsigned Opcode, SmallVectorImpl &Ops) { // Scan the operand lists looking for X and ~X pairs, along with X,X pairs. // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1. for (unsigned i = 0, e = Ops.size(); i != e; ++i) { // First, check for X and ~X in the operand list. assert(i < Ops.size()); if (BinaryOperator::isNot(Ops[i].Op)) { // Cannot occur for ^. Value *X = BinaryOperator::getNotArgument(Ops[i].Op); unsigned FoundX = FindInOperandList(Ops, i, X); if (FoundX != i) { if (Opcode == Instruction::And) // ...&X&~X = 0 return Constant::getNullValue(X->getType()); if (Opcode == Instruction::Or) // ...|X|~X = -1 return Constant::getAllOnesValue(X->getType()); } } // Next, check for duplicate pairs of values, which we assume are next to // each other, due to our sorting criteria. assert(i < Ops.size()); if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) { if (Opcode == Instruction::And || Opcode == Instruction::Or) { // Drop duplicate values for And and Or. Ops.erase(Ops.begin()+i); --i; --e; ++NumAnnihil; continue; } // Drop pairs of values for Xor. assert(Opcode == Instruction::Xor); if (e == 2) return Constant::getNullValue(Ops[0].Op->getType()); // Y ^ X^X -> Y Ops.erase(Ops.begin()+i, Ops.begin()+i+2); i -= 1; e -= 2; ++NumAnnihil; } } return nullptr; } /// Helper function of CombineXorOpnd(). It creates a bitwise-and /// instruction with the given two operands, and return the resulting /// instruction. There are two special cases: 1) if the constant operand is 0, /// it will return NULL. 2) if the constant is ~0, the symbolic operand will /// be returned. static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, const APInt &ConstOpnd) { if (ConstOpnd != 0) { if (!ConstOpnd.isAllOnesValue()) { LLVMContext &Ctx = Opnd->getType()->getContext(); Instruction *I; I = BinaryOperator::CreateAnd(Opnd, ConstantInt::get(Ctx, ConstOpnd), "and.ra", InsertBefore); I->setDebugLoc(InsertBefore->getDebugLoc()); return I; } return Opnd; } return nullptr; } // Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd" // into "R ^ C", where C would be 0, and R is a symbolic value. // // If it was successful, true is returned, and the "R" and "C" is returned // via "Res" and "ConstOpnd", respectively; otherwise, false is returned, // and both "Res" and "ConstOpnd" remain unchanged. // bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt &ConstOpnd, Value *&Res) { // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2 // = ((x | c1) ^ c1) ^ (c1 ^ c2) // = (x & ~c1) ^ (c1 ^ c2) // It is useful only when c1 == c2. if (Opnd1->isOrExpr() && Opnd1->getConstPart() != 0) { if (!Opnd1->getValue()->hasOneUse()) return false; const APInt &C1 = Opnd1->getConstPart(); if (C1 != ConstOpnd) return false; Value *X = Opnd1->getSymbolicPart(); Res = createAndInstr(I, X, ~C1); // ConstOpnd was C2, now C1 ^ C2. ConstOpnd ^= C1; if (Instruction *T = dyn_cast(Opnd1->getValue())) RedoInsts.insert(T); return true; } return false; } // Helper function of OptimizeXor(). It tries to simplify // "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a // symbolic value. // // If it was successful, true is returned, and the "R" and "C" is returned // via "Res" and "ConstOpnd", respectively (If the entire expression is // evaluated to a constant, the Res is set to NULL); otherwise, false is // returned, and both "Res" and "ConstOpnd" remain unchanged. bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2, APInt &ConstOpnd, Value *&Res) { Value *X = Opnd1->getSymbolicPart(); if (X != Opnd2->getSymbolicPart()) return false; // This many instruction become dead.(At least "Opnd1 ^ Opnd2" will die.) int DeadInstNum = 1; if (Opnd1->getValue()->hasOneUse()) DeadInstNum++; if (Opnd2->getValue()->hasOneUse()) DeadInstNum++; // Xor-Rule 2: // (x | c1) ^ (x & c2) // = (x|c1) ^ (x&c2) ^ (c1 ^ c1) = ((x|c1) ^ c1) ^ (x & c2) ^ c1 // = (x & ~c1) ^ (x & c2) ^ c1 // Xor-Rule 1 // = (x & c3) ^ c1, where c3 = ~c1 ^ c2 // Xor-rule 3 // if (Opnd1->isOrExpr() != Opnd2->isOrExpr()) { if (Opnd2->isOrExpr()) std::swap(Opnd1, Opnd2); const APInt &C1 = Opnd1->getConstPart(); const APInt &C2 = Opnd2->getConstPart(); APInt C3((~C1) ^ C2); // Do not increase code size! if (C3 != 0 && !C3.isAllOnesValue()) { int NewInstNum = ConstOpnd != 0 ? 1 : 2; if (NewInstNum > DeadInstNum) return false; } Res = createAndInstr(I, X, C3); ConstOpnd ^= C1; } else if (Opnd1->isOrExpr()) { // Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2 // const APInt &C1 = Opnd1->getConstPart(); const APInt &C2 = Opnd2->getConstPart(); APInt C3 = C1 ^ C2; // Do not increase code size if (C3 != 0 && !C3.isAllOnesValue()) { int NewInstNum = ConstOpnd != 0 ? 1 : 2; if (NewInstNum > DeadInstNum) return false; } Res = createAndInstr(I, X, C3); ConstOpnd ^= C3; } else { // Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2)) // const APInt &C1 = Opnd1->getConstPart(); const APInt &C2 = Opnd2->getConstPart(); APInt C3 = C1 ^ C2; Res = createAndInstr(I, X, C3); } // Put the original operands in the Redo list; hope they will be deleted // as dead code. if (Instruction *T = dyn_cast(Opnd1->getValue())) RedoInsts.insert(T); if (Instruction *T = dyn_cast(Opnd2->getValue())) RedoInsts.insert(T); return true; } /// Optimize a series of operands to an 'xor' instruction. If it can be reduced /// to a single Value, it is returned, otherwise the Ops list is mutated as /// necessary. Value *ReassociatePass::OptimizeXor(Instruction *I, SmallVectorImpl &Ops) { if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops)) return V; if (Ops.size() == 1) return nullptr; SmallVector Opnds; SmallVector OpndPtrs; Type *Ty = Ops[0].Op->getType(); APInt ConstOpnd(Ty->getIntegerBitWidth(), 0); // Step 1: Convert ValueEntry to XorOpnd for (unsigned i = 0, e = Ops.size(); i != e; ++i) { Value *V = Ops[i].Op; if (!isa(V)) { XorOpnd O(V); O.setSymbolicRank(getRank(O.getSymbolicPart())); Opnds.push_back(O); } else ConstOpnd ^= cast(V)->getValue(); } // NOTE: From this point on, do *NOT* add/delete element to/from "Opnds". // It would otherwise invalidate the "Opnds"'s iterator, and hence invalidate // the "OpndPtrs" as well. For the similar reason, do not fuse this loop // with the previous loop --- the iterator of the "Opnds" may be invalidated // when new elements are added to the vector. for (unsigned i = 0, e = Opnds.size(); i != e; ++i) OpndPtrs.push_back(&Opnds[i]); // Step 2: Sort the Xor-Operands in a way such that the operands containing // the same symbolic value cluster together. For instance, the input operand // sequence ("x | 123", "y & 456", "x & 789") will be sorted into: // ("x | 123", "x & 789", "y & 456"). // // The purpose is twofold: // 1) Cluster together the operands sharing the same symbolic-value. // 2) Operand having smaller symbolic-value-rank is permuted earlier, which // could potentially shorten crital path, and expose more loop-invariants. // Note that values' rank are basically defined in RPO order (FIXME). // So, if Rank(X) < Rank(Y) < Rank(Z), it means X is defined earlier // than Y which is defined earlier than Z. Permute "x | 1", "Y & 2", // "z" in the order of X-Y-Z is better than any other orders. std::stable_sort(OpndPtrs.begin(), OpndPtrs.end(), [](XorOpnd *LHS, XorOpnd *RHS) { return LHS->getSymbolicRank() < RHS->getSymbolicRank(); }); // Step 3: Combine adjacent operands XorOpnd *PrevOpnd = nullptr; bool Changed = false; for (unsigned i = 0, e = Opnds.size(); i < e; i++) { XorOpnd *CurrOpnd = OpndPtrs[i]; // The combined value Value *CV; // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd" if (ConstOpnd != 0 && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) { Changed = true; if (CV) *CurrOpnd = XorOpnd(CV); else { CurrOpnd->Invalidate(); continue; } } if (!PrevOpnd || CurrOpnd->getSymbolicPart() != PrevOpnd->getSymbolicPart()) { PrevOpnd = CurrOpnd; continue; } // step 3.2: When previous and current operands share the same symbolic // value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd" // if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) { // Remove previous operand PrevOpnd->Invalidate(); if (CV) { *CurrOpnd = XorOpnd(CV); PrevOpnd = CurrOpnd; } else { CurrOpnd->Invalidate(); PrevOpnd = nullptr; } Changed = true; } } // Step 4: Reassemble the Ops if (Changed) { Ops.clear(); for (unsigned int i = 0, e = Opnds.size(); i < e; i++) { XorOpnd &O = Opnds[i]; if (O.isInvalid()) continue; ValueEntry VE(getRank(O.getValue()), O.getValue()); Ops.push_back(VE); } if (ConstOpnd != 0) { Value *C = ConstantInt::get(Ty->getContext(), ConstOpnd); ValueEntry VE(getRank(C), C); Ops.push_back(VE); } int Sz = Ops.size(); if (Sz == 1) return Ops.back().Op; else if (Sz == 0) { assert(ConstOpnd == 0); return ConstantInt::get(Ty->getContext(), ConstOpnd); } } return nullptr; } /// Optimize a series of operands to an 'add' instruction. This /// optimizes based on identities. If it can be reduced to a single Value, it /// is returned, otherwise the Ops list is mutated as necessary. Value *ReassociatePass::OptimizeAdd(Instruction *I, SmallVectorImpl &Ops) { // Scan the operand lists looking for X and -X pairs. If we find any, we // can simplify expressions like X+-X == 0 and X+~X ==-1. While we're at it, // scan for any // duplicates. We want to canonicalize Y+Y+Y+Z -> 3*Y+Z. for (unsigned i = 0, e = Ops.size(); i != e; ++i) { Value *TheOp = Ops[i].Op; // Check to see if we've seen this operand before. If so, we factor all // instances of the operand together. Due to our sorting criteria, we know // that these need to be next to each other in the vector. if (i+1 != Ops.size() && Ops[i+1].Op == TheOp) { // Rescan the list, remove all instances of this operand from the expr. unsigned NumFound = 0; do { Ops.erase(Ops.begin()+i); ++NumFound; } while (i != Ops.size() && Ops[i].Op == TheOp); DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n'); ++NumFactor; // Insert a new multiply. Type *Ty = TheOp->getType(); Constant *C = Ty->isIntOrIntVectorTy() ? ConstantInt::get(Ty, NumFound) : ConstantFP::get(Ty, NumFound); Instruction *Mul = CreateMul(TheOp, C, "factor", I, I); // Now that we have inserted a multiply, optimize it. This allows us to // handle cases that require multiple factoring steps, such as this: // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6 RedoInsts.insert(Mul); // If every add operand was a duplicate, return the multiply. if (Ops.empty()) return Mul; // Otherwise, we had some input that didn't have the dupe, such as // "A + A + B" -> "A*2 + B". Add the new multiply to the list of // things being added by this operation. Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul)); --i; e = Ops.size(); continue; } // Check for X and -X or X and ~X in the operand list. if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isFNeg(TheOp) && !BinaryOperator::isNot(TheOp)) continue; Value *X = nullptr; if (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp)) X = BinaryOperator::getNegArgument(TheOp); else if (BinaryOperator::isNot(TheOp)) X = BinaryOperator::getNotArgument(TheOp); unsigned FoundX = FindInOperandList(Ops, i, X); if (FoundX == i) continue; // Remove X and -X from the operand list. if (Ops.size() == 2 && (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp))) return Constant::getNullValue(X->getType()); // Remove X and ~X from the operand list. if (Ops.size() == 2 && BinaryOperator::isNot(TheOp)) return Constant::getAllOnesValue(X->getType()); Ops.erase(Ops.begin()+i); if (i < FoundX) --FoundX; else --i; // Need to back up an extra one. Ops.erase(Ops.begin()+FoundX); ++NumAnnihil; --i; // Revisit element. e -= 2; // Removed two elements. // if X and ~X we append -1 to the operand list. if (BinaryOperator::isNot(TheOp)) { Value *V = Constant::getAllOnesValue(X->getType()); Ops.insert(Ops.end(), ValueEntry(getRank(V), V)); e += 1; } } // Scan the operand list, checking to see if there are any common factors // between operands. Consider something like A*A+A*B*C+D. We would like to // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies. // To efficiently find this, we count the number of times a factor occurs // for any ADD operands that are MULs. DenseMap FactorOccurrences; // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4) // where they are actually the same multiply. unsigned MaxOcc = 0; Value *MaxOccVal = nullptr; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul); if (!BOp) continue; // Compute all of the factors of this added value. SmallVector Factors; FindSingleUseMultiplyFactors(BOp, Factors, Ops); assert(Factors.size() > 1 && "Bad linearize!"); // Add one to FactorOccurrences for each unique factor in this op. SmallPtrSet Duplicates; for (unsigned i = 0, e = Factors.size(); i != e; ++i) { Value *Factor = Factors[i]; if (!Duplicates.insert(Factor).second) continue; unsigned Occ = ++FactorOccurrences[Factor]; if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; } // If Factor is a negative constant, add the negated value as a factor // because we can percolate the negate out. Watch for minint, which // cannot be positivified. if (ConstantInt *CI = dyn_cast(Factor)) { if (CI->isNegative() && !CI->isMinValue(true)) { Factor = ConstantInt::get(CI->getContext(), -CI->getValue()); assert(!Duplicates.count(Factor) && "Shouldn't have two constant factors, missed a canonicalize"); unsigned Occ = ++FactorOccurrences[Factor]; if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; } } } else if (ConstantFP *CF = dyn_cast(Factor)) { if (CF->isNegative()) { APFloat F(CF->getValueAPF()); F.changeSign(); Factor = ConstantFP::get(CF->getContext(), F); assert(!Duplicates.count(Factor) && "Shouldn't have two constant factors, missed a canonicalize"); unsigned Occ = ++FactorOccurrences[Factor]; if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; } } } } } // If any factor occurred more than one time, we can pull it out. if (MaxOcc > 1) { DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n'); ++NumFactor; // Create a new instruction that uses the MaxOccVal twice. If we don't do // this, we could otherwise run into situations where removing a factor // from an expression will drop a use of maxocc, and this can cause // RemoveFactorFromExpression on successive values to behave differently. Instruction *DummyInst = I->getType()->isIntOrIntVectorTy() ? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal) : BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal); SmallVector NewMulOps; for (unsigned i = 0; i != Ops.size(); ++i) { // Only try to remove factors from expressions we're allowed to. BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul); if (!BOp) continue; if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) { // The factorized operand may occur several times. Convert them all in // one fell swoop. for (unsigned j = Ops.size(); j != i;) { --j; if (Ops[j].Op == Ops[i].Op) { NewMulOps.push_back(V); Ops.erase(Ops.begin()+j); } } --i; } } // No need for extra uses anymore. delete DummyInst; unsigned NumAddedValues = NewMulOps.size(); Value *V = EmitAddTreeOfValues(I, NewMulOps); // Now that we have inserted the add tree, optimize it. This allows us to // handle cases that require multiple factoring steps, such as this: // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C)) assert(NumAddedValues > 1 && "Each occurrence should contribute a value"); (void)NumAddedValues; if (Instruction *VI = dyn_cast(V)) RedoInsts.insert(VI); // Create the multiply. Instruction *V2 = CreateMul(V, MaxOccVal, "tmp", I, I); // Rerun associate on the multiply in case the inner expression turned into // a multiply. We want to make sure that we keep things in canonical form. RedoInsts.insert(V2); // If every add operand included the factor (e.g. "A*B + A*C"), then the // entire result expression is just the multiply "A*(B+C)". if (Ops.empty()) return V2; // Otherwise, we had some input that didn't have the factor, such as // "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of // things being added by this operation. Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2)); } return nullptr; } /// \brief Build up a vector of value/power pairs factoring a product. /// /// Given a series of multiplication operands, build a vector of factors and /// the powers each is raised to when forming the final product. Sort them in /// the order of descending power. /// /// (x*x) -> [(x, 2)] /// ((x*x)*x) -> [(x, 3)] /// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)] /// /// \returns Whether any factors have a power greater than one. bool ReassociatePass::collectMultiplyFactors(SmallVectorImpl &Ops, SmallVectorImpl &Factors) { // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this. // Compute the sum of powers of simplifiable factors. unsigned FactorPowerSum = 0; for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) { Value *Op = Ops[Idx-1].Op; // Count the number of occurrences of this value. unsigned Count = 1; for (; Idx < Size && Ops[Idx].Op == Op; ++Idx) ++Count; // Track for simplification all factors which occur 2 or more times. if (Count > 1) FactorPowerSum += Count; } // We can only simplify factors if the sum of the powers of our simplifiable // factors is 4 or higher. When that is the case, we will *always* have // a simplification. This is an important invariant to prevent cyclicly // trying to simplify already minimal formations. if (FactorPowerSum < 4) return false; // Now gather the simplifiable factors, removing them from Ops. FactorPowerSum = 0; for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) { Value *Op = Ops[Idx-1].Op; // Count the number of occurrences of this value. unsigned Count = 1; for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx) ++Count; if (Count == 1) continue; // Move an even number of occurrences to Factors. Count &= ~1U; Idx -= Count; FactorPowerSum += Count; Factors.push_back(Factor(Op, Count)); Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count); } // None of the adjustments above should have reduced the sum of factor powers // below our mininum of '4'. assert(FactorPowerSum >= 4); std::stable_sort(Factors.begin(), Factors.end(), [](const Factor &LHS, const Factor &RHS) { return LHS.Power > RHS.Power; }); return true; } /// \brief Build a tree of multiplies, computing the product of Ops. static Value *buildMultiplyTree(IRBuilder<> &Builder, SmallVectorImpl &Ops) { if (Ops.size() == 1) return Ops.back(); Value *LHS = Ops.pop_back_val(); do { if (LHS->getType()->isIntOrIntVectorTy()) LHS = Builder.CreateMul(LHS, Ops.pop_back_val()); else LHS = Builder.CreateFMul(LHS, Ops.pop_back_val()); } while (!Ops.empty()); return LHS; } /// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*... /// /// Given a vector of values raised to various powers, where no two values are /// equal and the powers are sorted in decreasing order, compute the minimal /// DAG of multiplies to compute the final product, and return that product /// value. Value * ReassociatePass::buildMinimalMultiplyDAG(IRBuilder<> &Builder, SmallVectorImpl &Factors) { assert(Factors[0].Power); SmallVector OuterProduct; for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size(); Idx < Size && Factors[Idx].Power > 0; ++Idx) { if (Factors[Idx].Power != Factors[LastIdx].Power) { LastIdx = Idx; continue; } // We want to multiply across all the factors with the same power so that // we can raise them to that power as a single entity. Build a mini tree // for that. SmallVector InnerProduct; InnerProduct.push_back(Factors[LastIdx].Base); do { InnerProduct.push_back(Factors[Idx].Base); ++Idx; } while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power); // Reset the base value of the first factor to the new expression tree. // We'll remove all the factors with the same power in a second pass. Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct); if (Instruction *MI = dyn_cast(M)) RedoInsts.insert(MI); LastIdx = Idx; } // Unique factors with equal powers -- we've folded them into the first one's // base. Factors.erase(std::unique(Factors.begin(), Factors.end(), [](const Factor &LHS, const Factor &RHS) { return LHS.Power == RHS.Power; }), Factors.end()); // Iteratively collect the base of each factor with an add power into the // outer product, and halve each power in preparation for squaring the // expression. for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) { if (Factors[Idx].Power & 1) OuterProduct.push_back(Factors[Idx].Base); Factors[Idx].Power >>= 1; } if (Factors[0].Power) { Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors); OuterProduct.push_back(SquareRoot); OuterProduct.push_back(SquareRoot); } if (OuterProduct.size() == 1) return OuterProduct.front(); Value *V = buildMultiplyTree(Builder, OuterProduct); return V; } Value *ReassociatePass::OptimizeMul(BinaryOperator *I, SmallVectorImpl &Ops) { // We can only optimize the multiplies when there is a chain of more than // three, such that a balanced tree might require fewer total multiplies. if (Ops.size() < 4) return nullptr; // Try to turn linear trees of multiplies without other uses of the // intermediate stages into minimal multiply DAGs with perfect sub-expression // re-use. SmallVector Factors; if (!collectMultiplyFactors(Ops, Factors)) return nullptr; // All distinct factors, so nothing left for us to do. IRBuilder<> Builder(I); Value *V = buildMinimalMultiplyDAG(Builder, Factors); if (Ops.empty()) return V; ValueEntry NewEntry = ValueEntry(getRank(V), V); Ops.insert(std::lower_bound(Ops.begin(), Ops.end(), NewEntry), NewEntry); return nullptr; } Value *ReassociatePass::OptimizeExpression(BinaryOperator *I, SmallVectorImpl &Ops) { // Now that we have the linearized expression tree, try to optimize it. // Start by folding any constants that we found. Constant *Cst = nullptr; unsigned Opcode = I->getOpcode(); while (!Ops.empty() && isa(Ops.back().Op)) { Constant *C = cast(Ops.pop_back_val().Op); Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C; } // If there was nothing but constants then we are done. if (Ops.empty()) return Cst; // Put the combined constant back at the end of the operand list, except if // there is no point. For example, an add of 0 gets dropped here, while a // multiplication by zero turns the whole expression into zero. if (Cst && Cst != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) { if (Cst == ConstantExpr::getBinOpAbsorber(Opcode, I->getType())) return Cst; Ops.push_back(ValueEntry(0, Cst)); } if (Ops.size() == 1) return Ops[0].Op; // Handle destructive annihilation due to identities between elements in the // argument list here. unsigned NumOps = Ops.size(); switch (Opcode) { default: break; case Instruction::And: case Instruction::Or: if (Value *Result = OptimizeAndOrXor(Opcode, Ops)) return Result; break; case Instruction::Xor: if (Value *Result = OptimizeXor(I, Ops)) return Result; break; case Instruction::Add: case Instruction::FAdd: if (Value *Result = OptimizeAdd(I, Ops)) return Result; break; case Instruction::Mul: case Instruction::FMul: if (Value *Result = OptimizeMul(I, Ops)) return Result; break; } if (Ops.size() != NumOps) return OptimizeExpression(I, Ops); return nullptr; } // Remove dead instructions and if any operands are trivially dead add them to // Insts so they will be removed as well. void ReassociatePass::RecursivelyEraseDeadInsts( Instruction *I, SetVector> &Insts) { assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!"); SmallVector Ops(I->op_begin(), I->op_end()); ValueRankMap.erase(I); Insts.remove(I); RedoInsts.remove(I); I->eraseFromParent(); for (auto Op : Ops) if (Instruction *OpInst = dyn_cast(Op)) if (OpInst->use_empty()) Insts.insert(OpInst); } /// Zap the given instruction, adding interesting operands to the work list. void ReassociatePass::EraseInst(Instruction *I) { assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!"); SmallVector Ops(I->op_begin(), I->op_end()); // Erase the dead instruction. ValueRankMap.erase(I); RedoInsts.remove(I); I->eraseFromParent(); // Optimize its operands. SmallPtrSet Visited; // Detect self-referential nodes. for (unsigned i = 0, e = Ops.size(); i != e; ++i) if (Instruction *Op = dyn_cast(Ops[i])) { // If this is a node in an expression tree, climb to the expression root // and add that since that's where optimization actually happens. unsigned Opcode = Op->getOpcode(); while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode && Visited.insert(Op).second) Op = Op->user_back(); RedoInsts.insert(Op); } } // Canonicalize expressions of the following form: // x + (-Constant * y) -> x - (Constant * y) // x - (-Constant * y) -> x + (Constant * y) Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) { if (!I->hasOneUse() || I->getType()->isVectorTy()) return nullptr; // Must be a fmul or fdiv instruction. unsigned Opcode = I->getOpcode(); if (Opcode != Instruction::FMul && Opcode != Instruction::FDiv) return nullptr; auto *C0 = dyn_cast(I->getOperand(0)); auto *C1 = dyn_cast(I->getOperand(1)); // Both operands are constant, let it get constant folded away. if (C0 && C1) return nullptr; ConstantFP *CF = C0 ? C0 : C1; // Must have one constant operand. if (!CF) return nullptr; // Must be a negative ConstantFP. if (!CF->isNegative()) return nullptr; // User must be a binary operator with one or more uses. Instruction *User = I->user_back(); if (!isa(User) || !User->hasNUsesOrMore(1)) return nullptr; unsigned UserOpcode = User->getOpcode(); if (UserOpcode != Instruction::FAdd && UserOpcode != Instruction::FSub) return nullptr; // Subtraction is not commutative. Explicitly, the following transform is // not valid: (-Constant * y) - x -> x + (Constant * y) if (!User->isCommutative() && User->getOperand(1) != I) return nullptr; // Change the sign of the constant. APFloat Val = CF->getValueAPF(); Val.changeSign(); I->setOperand(C0 ? 0 : 1, ConstantFP::get(CF->getContext(), Val)); // Canonicalize I to RHS to simplify the next bit of logic. E.g., // ((-Const*y) + x) -> (x + (-Const*y)). if (User->getOperand(0) == I && User->isCommutative()) cast(User)->swapOperands(); Value *Op0 = User->getOperand(0); Value *Op1 = User->getOperand(1); BinaryOperator *NI; switch (UserOpcode) { default: llvm_unreachable("Unexpected Opcode!"); case Instruction::FAdd: NI = BinaryOperator::CreateFSub(Op0, Op1); NI->setFastMathFlags(cast(User)->getFastMathFlags()); break; case Instruction::FSub: NI = BinaryOperator::CreateFAdd(Op0, Op1); NI->setFastMathFlags(cast(User)->getFastMathFlags()); break; } NI->insertBefore(User); NI->setName(User->getName()); User->replaceAllUsesWith(NI); NI->setDebugLoc(I->getDebugLoc()); RedoInsts.insert(I); MadeChange = true; return NI; } /// Inspect and optimize the given instruction. Note that erasing /// instructions is not allowed. void ReassociatePass::OptimizeInst(Instruction *I) { // Only consider operations that we understand. if (!isa(I)) return; if (I->getOpcode() == Instruction::Shl && isa(I->getOperand(1))) // If an operand of this shift is a reassociable multiply, or if the shift // is used by a reassociable multiply or add, turn into a multiply. if (isReassociableOp(I->getOperand(0), Instruction::Mul) || (I->hasOneUse() && (isReassociableOp(I->user_back(), Instruction::Mul) || isReassociableOp(I->user_back(), Instruction::Add)))) { Instruction *NI = ConvertShiftToMul(I); RedoInsts.insert(I); MadeChange = true; I = NI; } // Canonicalize negative constants out of expressions. if (Instruction *Res = canonicalizeNegConstExpr(I)) I = Res; // Commute binary operators, to canonicalize the order of their operands. // This can potentially expose more CSE opportunities, and makes writing other // transformations simpler. if (I->isCommutative()) canonicalizeOperands(I); // TODO: We should optimize vector Xor instructions, but they are // currently unsupported. if (I->getType()->isVectorTy() && I->getOpcode() == Instruction::Xor) return; // Don't optimize floating point instructions that don't have unsafe algebra. if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra()) return; // Do not reassociate boolean (i1) expressions. We want to preserve the // original order of evaluation for short-circuited comparisons that // SimplifyCFG has folded to AND/OR expressions. If the expression // is not further optimized, it is likely to be transformed back to a // short-circuited form for code gen, and the source order may have been // optimized for the most likely conditions. if (I->getType()->isIntegerTy(1)) return; // If this is a subtract instruction which is not already in negate form, // see if we can convert it to X+-Y. if (I->getOpcode() == Instruction::Sub) { if (ShouldBreakUpSubtract(I)) { Instruction *NI = BreakUpSubtract(I, RedoInsts); RedoInsts.insert(I); MadeChange = true; I = NI; } else if (BinaryOperator::isNeg(I)) { // Otherwise, this is a negation. See if the operand is a multiply tree // and if this is not an inner node of a multiply tree. if (isReassociableOp(I->getOperand(1), Instruction::Mul) && (!I->hasOneUse() || !isReassociableOp(I->user_back(), Instruction::Mul))) { Instruction *NI = LowerNegateToMultiply(I); // If the negate was simplified, revisit the users to see if we can // reassociate further. for (User *U : NI->users()) { if (BinaryOperator *Tmp = dyn_cast(U)) RedoInsts.insert(Tmp); } RedoInsts.insert(I); MadeChange = true; I = NI; } } } else if (I->getOpcode() == Instruction::FSub) { if (ShouldBreakUpSubtract(I)) { Instruction *NI = BreakUpSubtract(I, RedoInsts); RedoInsts.insert(I); MadeChange = true; I = NI; } else if (BinaryOperator::isFNeg(I)) { // Otherwise, this is a negation. See if the operand is a multiply tree // and if this is not an inner node of a multiply tree. if (isReassociableOp(I->getOperand(1), Instruction::FMul) && (!I->hasOneUse() || !isReassociableOp(I->user_back(), Instruction::FMul))) { // If the negate was simplified, revisit the users to see if we can // reassociate further. Instruction *NI = LowerNegateToMultiply(I); for (User *U : NI->users()) { if (BinaryOperator *Tmp = dyn_cast(U)) RedoInsts.insert(Tmp); } RedoInsts.insert(I); MadeChange = true; I = NI; } } } // If this instruction is an associative binary operator, process it. if (!I->isAssociative()) return; BinaryOperator *BO = cast(I); // If this is an interior node of a reassociable tree, ignore it until we // get to the root of the tree, to avoid N^2 analysis. unsigned Opcode = BO->getOpcode(); if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) { // During the initial run we will get to the root of the tree. // But if we get here while we are redoing instructions, there is no // guarantee that the root will be visited. So Redo later if (BO->user_back() != BO && BO->getParent() == BO->user_back()->getParent()) RedoInsts.insert(BO->user_back()); return; } // If this is an add tree that is used by a sub instruction, ignore it // until we process the subtract. if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add && cast(BO->user_back())->getOpcode() == Instruction::Sub) return; if (BO->hasOneUse() && BO->getOpcode() == Instruction::FAdd && cast(BO->user_back())->getOpcode() == Instruction::FSub) return; ReassociateExpression(BO); } void ReassociatePass::ReassociateExpression(BinaryOperator *I) { // First, walk the expression tree, linearizing the tree, collecting the // operand information. SmallVector Tree; MadeChange |= LinearizeExprTree(I, Tree); SmallVector Ops; Ops.reserve(Tree.size()); for (unsigned i = 0, e = Tree.size(); i != e; ++i) { RepeatedValue E = Tree[i]; Ops.append(E.second.getZExtValue(), ValueEntry(getRank(E.first), E.first)); } DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n'); // Now that we have linearized the tree to a list and have gathered all of // the operands and their ranks, sort the operands by their rank. Use a // stable_sort so that values with equal ranks will have their relative // positions maintained (and so the compiler is deterministic). Note that // this sorts so that the highest ranking values end up at the beginning of // the vector. std::stable_sort(Ops.begin(), Ops.end()); // Now that we have the expression tree in a convenient // sorted form, optimize it globally if possible. if (Value *V = OptimizeExpression(I, Ops)) { if (V == I) // Self-referential expression in unreachable code. return; // This expression tree simplified to something that isn't a tree, // eliminate it. DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n'); I->replaceAllUsesWith(V); if (Instruction *VI = dyn_cast(V)) VI->setDebugLoc(I->getDebugLoc()); RedoInsts.insert(I); ++NumAnnihil; return; } // We want to sink immediates as deeply as possible except in the case where // this is a multiply tree used only by an add, and the immediate is a -1. // In this case we reassociate to put the negation on the outside so that we // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y if (I->hasOneUse()) { if (I->getOpcode() == Instruction::Mul && cast(I->user_back())->getOpcode() == Instruction::Add && isa(Ops.back().Op) && cast(Ops.back().Op)->isAllOnesValue()) { ValueEntry Tmp = Ops.pop_back_val(); Ops.insert(Ops.begin(), Tmp); } else if (I->getOpcode() == Instruction::FMul && cast(I->user_back())->getOpcode() == Instruction::FAdd && isa(Ops.back().Op) && cast(Ops.back().Op)->isExactlyValue(-1.0)) { ValueEntry Tmp = Ops.pop_back_val(); Ops.insert(Ops.begin(), Tmp); } } DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n'); if (Ops.size() == 1) { if (Ops[0].Op == I) // Self-referential expression in unreachable code. return; // This expression tree simplified to something that isn't a tree, // eliminate it. I->replaceAllUsesWith(Ops[0].Op); if (Instruction *OI = dyn_cast(Ops[0].Op)) OI->setDebugLoc(I->getDebugLoc()); RedoInsts.insert(I); return; } // Now that we ordered and optimized the expressions, splat them back into // the expression tree, removing any unneeded nodes. RewriteExprTree(I, Ops); } PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) { - // Reassociate needs for each instruction to have its operands already - // processed, so we first perform a RPOT of the basic blocks so that - // when we process a basic block, all its dominators have been processed - // before. - ReversePostOrderTraversal RPOT(&F); - BuildRankMap(F, RPOT); + // Calculate the rank map for F. + BuildRankMap(F); MadeChange = false; - for (BasicBlock *BI : RPOT) { - // Use a worklist to keep track of which instructions have been processed - // (and which insts won't be optimized again) so when redoing insts, - // optimize insts rightaway which won't be processed later. - SmallSet Worklist; - - // Insert all instructions in the BB - for (Instruction &I : *BI) - Worklist.insert(&I); - + for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { // Optimize every instruction in the basic block. - for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;) { - // This instruction has been processed. - Worklist.erase(&*II); + for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;) if (isInstructionTriviallyDead(&*II)) { EraseInst(&*II++); } else { OptimizeInst(&*II); assert(II->getParent() == &*BI && "Moved to a different block!"); ++II; } - // If the above optimizations produced new instructions to optimize or - // made modifications which need to be redone, do them now if they won't - // be handled later. - while (!RedoInsts.empty()) { - Instruction *I = RedoInsts.pop_back_val(); - // Process instructions that won't be processed later, either - // inside the block itself or in another basic block (based on rank), - // since these will be processed later. - if ((I->getParent() != BI || !Worklist.count(I)) && - RankMap[I->getParent()] <= RankMap[BI]) { - if (isInstructionTriviallyDead(I)) - EraseInst(I); - else - OptimizeInst(I); - } - } + // Make a copy of all the instructions to be redone so we can remove dead + // instructions. + SetVector> ToRedo(RedoInsts); + // Iterate over all instructions to be reevaluated and remove trivially dead + // instructions. If any operand of the trivially dead instruction becomes + // dead mark it for deletion as well. Continue this process until all + // trivially dead instructions have been removed. + while (!ToRedo.empty()) { + Instruction *I = ToRedo.pop_back_val(); + if (isInstructionTriviallyDead(I)) + RecursivelyEraseDeadInsts(I, ToRedo); + } + + // Now that we have removed dead instructions, we can reoptimize the + // remaining instructions. + while (!RedoInsts.empty()) { + Instruction *I = RedoInsts.pop_back_val(); + if (isInstructionTriviallyDead(I)) + EraseInst(I); + else + OptimizeInst(I); } } // We are done with the rank map. RankMap.clear(); ValueRankMap.clear(); if (MadeChange) { // FIXME: This should also 'preserve the CFG'. auto PA = PreservedAnalyses(); PA.preserve(); return PA; } return PreservedAnalyses::all(); } namespace { class ReassociateLegacyPass : public FunctionPass { ReassociatePass Impl; public: static char ID; // Pass identification, replacement for typeid ReassociateLegacyPass() : FunctionPass(ID) { initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; FunctionAnalysisManager DummyFAM; auto PA = Impl.run(F, DummyFAM); return !PA.areAllPreserved(); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addPreserved(); } }; } char ReassociateLegacyPass::ID = 0; INITIALIZE_PASS(ReassociateLegacyPass, "reassociate", "Reassociate expressions", false, false) // Public interface to the Reassociate pass FunctionPass *llvm::createReassociatePass() { return new ReassociateLegacyPass(); } Index: projects/clang390-import/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp =================================================================== --- projects/clang390-import/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp (revision 304769) +++ projects/clang390-import/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp (revision 304770) @@ -1,743 +1,749 @@ //===- CloneFunction.cpp - Clone a function into another function ---------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements the CloneFunctionInto interface, which is used as the // low-level function cloner. This is used by the CloneFunction and function // inliner to do the dirty work of copying the body of a function around. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include using namespace llvm; /// See comments in Cloning.h. BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix, Function *F, ClonedCodeInfo *CodeInfo) { BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F); if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix); bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false; // Loop over all instructions, and copy them over. for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end(); II != IE; ++II) { Instruction *NewInst = II->clone(); if (II->hasName()) NewInst->setName(II->getName()+NameSuffix); NewBB->getInstList().push_back(NewInst); VMap[&*II] = NewInst; // Add instruction map to value. hasCalls |= (isa(II) && !isa(II)); if (const AllocaInst *AI = dyn_cast(II)) { if (isa(AI->getArraySize())) hasStaticAllocas = true; else hasDynamicAllocas = true; } } if (CodeInfo) { CodeInfo->ContainsCalls |= hasCalls; CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas; CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && BB != &BB->getParent()->getEntryBlock(); } return NewBB; } // Clone OldFunc into NewFunc, transforming the old arguments into references to // VMap values. // void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap, bool ModuleLevelChanges, SmallVectorImpl &Returns, const char *NameSuffix, ClonedCodeInfo *CodeInfo, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) { assert(NameSuffix && "NameSuffix cannot be null!"); #ifndef NDEBUG for (const Argument &I : OldFunc->args()) assert(VMap.count(&I) && "No mapping from source argument specified!"); #endif // Copy all attributes other than those stored in the AttributeSet. We need // to remap the parameter indices of the AttributeSet. AttributeSet NewAttrs = NewFunc->getAttributes(); NewFunc->copyAttributesFrom(OldFunc); NewFunc->setAttributes(NewAttrs); // Fix up the personality function that got copied over. if (OldFunc->hasPersonalityFn()) NewFunc->setPersonalityFn( MapValue(OldFunc->getPersonalityFn(), VMap, ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, TypeMapper, Materializer)); AttributeSet OldAttrs = OldFunc->getAttributes(); // Clone any argument attributes that are present in the VMap. for (const Argument &OldArg : OldFunc->args()) if (Argument *NewArg = dyn_cast(VMap[&OldArg])) { AttributeSet attrs = OldAttrs.getParamAttributes(OldArg.getArgNo() + 1); if (attrs.getNumSlots() > 0) NewArg->addAttr(attrs); } NewFunc->setAttributes( NewFunc->getAttributes() .addAttributes(NewFunc->getContext(), AttributeSet::ReturnIndex, OldAttrs.getRetAttributes()) .addAttributes(NewFunc->getContext(), AttributeSet::FunctionIndex, OldAttrs.getFnAttributes())); SmallVector, 1> MDs; OldFunc->getAllMetadata(MDs); for (auto MD : MDs) NewFunc->addMetadata( MD.first, *MapMetadata(MD.second, VMap, ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, TypeMapper, Materializer)); // Loop over all of the basic blocks in the function, cloning them as // appropriate. Note that we save BE this way in order to handle cloning of // recursive functions into themselves. // for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end(); BI != BE; ++BI) { const BasicBlock &BB = *BI; // Create a new basic block and copy instructions into it! BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo); // Add basic block mapping. VMap[&BB] = CBB; // It is only legal to clone a function if a block address within that // function is never referenced outside of the function. Given that, we // want to map block addresses from the old function to block addresses in // the clone. (This is different from the generic ValueMapper // implementation, which generates an invalid blockaddress when // cloning a function.) if (BB.hasAddressTaken()) { Constant *OldBBAddr = BlockAddress::get(const_cast(OldFunc), const_cast(&BB)); VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB); } // Note return instructions for the caller. if (ReturnInst *RI = dyn_cast(CBB->getTerminator())) Returns.push_back(RI); } // Loop over all of the instructions in the function, fixing up operand // references as we go. This uses VMap to do all the hard work. for (Function::iterator BB = cast(VMap[&OldFunc->front()])->getIterator(), BE = NewFunc->end(); BB != BE; ++BB) // Loop over all instructions, fixing each one as we find it... for (Instruction &II : *BB) RemapInstruction(&II, VMap, ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, TypeMapper, Materializer); } /// Return a copy of the specified function and add it to that function's /// module. Also, any references specified in the VMap are changed to refer to /// their mapped value instead of the original one. If any of the arguments to /// the function are in the VMap, the arguments are deleted from the resultant /// function. The VMap is updated to include mappings from all of the /// instructions and basicblocks in the function from their old to new values. /// Function *llvm::CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo) { std::vector ArgTypes; // The user might be deleting arguments to the function by specifying them in // the VMap. If so, we need to not add the arguments to the arg ty vector // for (const Argument &I : F->args()) if (VMap.count(&I) == 0) // Haven't mapped the argument to anything yet? ArgTypes.push_back(I.getType()); // Create a new function type... FunctionType *FTy = FunctionType::get(F->getFunctionType()->getReturnType(), ArgTypes, F->getFunctionType()->isVarArg()); // Create the new function... Function *NewF = Function::Create(FTy, F->getLinkage(), F->getName(), F->getParent()); // Loop over the arguments, copying the names of the mapped arguments over... Function::arg_iterator DestI = NewF->arg_begin(); for (const Argument & I : F->args()) if (VMap.count(&I) == 0) { // Is this argument preserved? DestI->setName(I.getName()); // Copy the name over... VMap[&I] = &*DestI++; // Add mapping to VMap } SmallVector Returns; // Ignore returns cloned. CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns, "", CodeInfo); return NewF; } namespace { /// This is a private class used to implement CloneAndPruneFunctionInto. struct PruningFunctionCloner { Function *NewFunc; const Function *OldFunc; ValueToValueMapTy &VMap; bool ModuleLevelChanges; const char *NameSuffix; ClonedCodeInfo *CodeInfo; public: PruningFunctionCloner(Function *newFunc, const Function *oldFunc, ValueToValueMapTy &valueMap, bool moduleLevelChanges, const char *nameSuffix, ClonedCodeInfo *codeInfo) : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap), ModuleLevelChanges(moduleLevelChanges), NameSuffix(nameSuffix), CodeInfo(codeInfo) {} /// The specified block is found to be reachable, clone it and /// anything that it can reach. void CloneBlock(const BasicBlock *BB, BasicBlock::const_iterator StartingInst, std::vector &ToClone); }; } /// The specified block is found to be reachable, clone it and /// anything that it can reach. void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, BasicBlock::const_iterator StartingInst, std::vector &ToClone){ WeakVH &BBEntry = VMap[BB]; // Have we already cloned this block? if (BBEntry) return; // Nope, clone it now. BasicBlock *NewBB; BBEntry = NewBB = BasicBlock::Create(BB->getContext()); if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix); // It is only legal to clone a function if a block address within that // function is never referenced outside of the function. Given that, we // want to map block addresses from the old function to block addresses in // the clone. (This is different from the generic ValueMapper // implementation, which generates an invalid blockaddress when // cloning a function.) // // Note that we don't need to fix the mapping for unreachable blocks; // the default mapping there is safe. if (BB->hasAddressTaken()) { Constant *OldBBAddr = BlockAddress::get(const_cast(OldFunc), const_cast(BB)); VMap[OldBBAddr] = BlockAddress::get(NewFunc, NewBB); } bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false; // Loop over all instructions, and copy them over, DCE'ing as we go. This // loop doesn't include the terminator. for (BasicBlock::const_iterator II = StartingInst, IE = --BB->end(); II != IE; ++II) { Instruction *NewInst = II->clone(); // Eagerly remap operands to the newly cloned instruction, except for PHI // nodes for which we defer processing until we update the CFG. if (!isa(NewInst)) { RemapInstruction(NewInst, VMap, ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); // If we can simplify this instruction to some other value, simply add // a mapping to that value rather than inserting a new instruction into // the basic block. if (Value *V = SimplifyInstruction(NewInst, BB->getModule()->getDataLayout())) { // On the off-chance that this simplifies to an instruction in the old // function, map it back into the new function. if (Value *MappedV = VMap.lookup(V)) V = MappedV; if (!NewInst->mayHaveSideEffects()) { VMap[&*II] = V; delete NewInst; continue; } } } if (II->hasName()) NewInst->setName(II->getName()+NameSuffix); VMap[&*II] = NewInst; // Add instruction map to value. NewBB->getInstList().push_back(NewInst); hasCalls |= (isa(II) && !isa(II)); if (CodeInfo) if (auto CS = ImmutableCallSite(&*II)) if (CS.hasOperandBundles()) CodeInfo->OperandBundleCallSites.push_back(NewInst); if (const AllocaInst *AI = dyn_cast(II)) { if (isa(AI->getArraySize())) hasStaticAllocas = true; else hasDynamicAllocas = true; } } // Finally, clone over the terminator. const TerminatorInst *OldTI = BB->getTerminator(); bool TerminatorDone = false; if (const BranchInst *BI = dyn_cast(OldTI)) { if (BI->isConditional()) { // If the condition was a known constant in the callee... ConstantInt *Cond = dyn_cast(BI->getCondition()); // Or is a known constant in the caller... if (!Cond) { Value *V = VMap.lookup(BI->getCondition()); Cond = dyn_cast_or_null(V); } // Constant fold to uncond branch! if (Cond) { BasicBlock *Dest = BI->getSuccessor(!Cond->getZExtValue()); VMap[OldTI] = BranchInst::Create(Dest, NewBB); ToClone.push_back(Dest); TerminatorDone = true; } } } else if (const SwitchInst *SI = dyn_cast(OldTI)) { // If switching on a value known constant in the caller. ConstantInt *Cond = dyn_cast(SI->getCondition()); if (!Cond) { // Or known constant after constant prop in the callee... Value *V = VMap.lookup(SI->getCondition()); Cond = dyn_cast_or_null(V); } if (Cond) { // Constant fold to uncond branch! SwitchInst::ConstCaseIt Case = SI->findCaseValue(Cond); BasicBlock *Dest = const_cast(Case.getCaseSuccessor()); VMap[OldTI] = BranchInst::Create(Dest, NewBB); ToClone.push_back(Dest); TerminatorDone = true; } } if (!TerminatorDone) { Instruction *NewInst = OldTI->clone(); if (OldTI->hasName()) NewInst->setName(OldTI->getName()+NameSuffix); NewBB->getInstList().push_back(NewInst); VMap[OldTI] = NewInst; // Add instruction map to value. if (CodeInfo) if (auto CS = ImmutableCallSite(OldTI)) if (CS.hasOperandBundles()) CodeInfo->OperandBundleCallSites.push_back(NewInst); // Recursively clone any reachable successor blocks. const TerminatorInst *TI = BB->getTerminator(); for (const BasicBlock *Succ : TI->successors()) ToClone.push_back(Succ); } if (CodeInfo) { CodeInfo->ContainsCalls |= hasCalls; CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas; CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && BB != &BB->getParent()->front(); } } /// This works like CloneAndPruneFunctionInto, except that it does not clone the /// entire function. Instead it starts at an instruction provided by the caller /// and copies (and prunes) only the code reachable from that instruction. void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, const Instruction *StartingInst, ValueToValueMapTy &VMap, bool ModuleLevelChanges, SmallVectorImpl &Returns, const char *NameSuffix, ClonedCodeInfo *CodeInfo) { assert(NameSuffix && "NameSuffix cannot be null!"); ValueMapTypeRemapper *TypeMapper = nullptr; ValueMaterializer *Materializer = nullptr; #ifndef NDEBUG // If the cloning starts at the beginning of the function, verify that // the function arguments are mapped. if (!StartingInst) for (const Argument &II : OldFunc->args()) assert(VMap.count(&II) && "No mapping from source argument specified!"); #endif PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges, NameSuffix, CodeInfo); const BasicBlock *StartingBB; if (StartingInst) StartingBB = StartingInst->getParent(); else { StartingBB = &OldFunc->getEntryBlock(); StartingInst = &StartingBB->front(); } // Clone the entry block, and anything recursively reachable from it. std::vector CloneWorklist; PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist); while (!CloneWorklist.empty()) { const BasicBlock *BB = CloneWorklist.back(); CloneWorklist.pop_back(); PFC.CloneBlock(BB, BB->begin(), CloneWorklist); } // Loop over all of the basic blocks in the old function. If the block was // reachable, we have cloned it and the old block is now in the value map: // insert it into the new function in the right order. If not, ignore it. // // Defer PHI resolution until rest of function is resolved. SmallVector PHIToResolve; for (const BasicBlock &BI : *OldFunc) { Value *V = VMap.lookup(&BI); BasicBlock *NewBB = cast_or_null(V); if (!NewBB) continue; // Dead block. // Add the new block to the new function. NewFunc->getBasicBlockList().push_back(NewBB); // Handle PHI nodes specially, as we have to remove references to dead // blocks. for (BasicBlock::const_iterator I = BI.begin(), E = BI.end(); I != E; ++I) { // PHI nodes may have been remapped to non-PHI nodes by the caller or // during the cloning process. if (const PHINode *PN = dyn_cast(I)) { if (isa(VMap[PN])) PHIToResolve.push_back(PN); else break; } else { break; } } // Finally, remap the terminator instructions, as those can't be remapped // until all BBs are mapped. RemapInstruction(NewBB->getTerminator(), VMap, ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, TypeMapper, Materializer); } // Defer PHI resolution until rest of function is resolved, PHI resolution // requires the CFG to be up-to-date. for (unsigned phino = 0, e = PHIToResolve.size(); phino != e; ) { const PHINode *OPN = PHIToResolve[phino]; unsigned NumPreds = OPN->getNumIncomingValues(); const BasicBlock *OldBB = OPN->getParent(); BasicBlock *NewBB = cast(VMap[OldBB]); // Map operands for blocks that are live and remove operands for blocks // that are dead. for (; phino != PHIToResolve.size() && PHIToResolve[phino]->getParent() == OldBB; ++phino) { OPN = PHIToResolve[phino]; PHINode *PN = cast(VMap[OPN]); for (unsigned pred = 0, e = NumPreds; pred != e; ++pred) { Value *V = VMap.lookup(PN->getIncomingBlock(pred)); if (BasicBlock *MappedBlock = cast_or_null(V)) { Value *InVal = MapValue(PN->getIncomingValue(pred), VMap, ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); assert(InVal && "Unknown input value?"); PN->setIncomingValue(pred, InVal); PN->setIncomingBlock(pred, MappedBlock); } else { PN->removeIncomingValue(pred, false); --pred; // Revisit the next entry. --e; } } } // The loop above has removed PHI entries for those blocks that are dead // and has updated others. However, if a block is live (i.e. copied over) // but its terminator has been changed to not go to this block, then our // phi nodes will have invalid entries. Update the PHI nodes in this // case. PHINode *PN = cast(NewBB->begin()); NumPreds = std::distance(pred_begin(NewBB), pred_end(NewBB)); if (NumPreds != PN->getNumIncomingValues()) { assert(NumPreds < PN->getNumIncomingValues()); // Count how many times each predecessor comes to this block. std::map PredCount; for (pred_iterator PI = pred_begin(NewBB), E = pred_end(NewBB); PI != E; ++PI) --PredCount[*PI]; // Figure out how many entries to remove from each PHI. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) ++PredCount[PN->getIncomingBlock(i)]; // At this point, the excess predecessor entries are positive in the // map. Loop over all of the PHIs and remove excess predecessor // entries. BasicBlock::iterator I = NewBB->begin(); for (; (PN = dyn_cast(I)); ++I) { for (const auto &PCI : PredCount) { BasicBlock *Pred = PCI.first; for (unsigned NumToRemove = PCI.second; NumToRemove; --NumToRemove) PN->removeIncomingValue(Pred, false); } } } // If the loops above have made these phi nodes have 0 or 1 operand, // replace them with undef or the input value. We must do this for // correctness, because 0-operand phis are not valid. PN = cast(NewBB->begin()); if (PN->getNumIncomingValues() == 0) { BasicBlock::iterator I = NewBB->begin(); BasicBlock::const_iterator OldI = OldBB->begin(); while ((PN = dyn_cast(I++))) { Value *NV = UndefValue::get(PN->getType()); PN->replaceAllUsesWith(NV); assert(VMap[&*OldI] == PN && "VMap mismatch"); VMap[&*OldI] = NV; PN->eraseFromParent(); ++OldI; } } } // Make a second pass over the PHINodes now that all of them have been // remapped into the new function, simplifying the PHINode and performing any // recursive simplifications exposed. This will transparently update the // WeakVH in the VMap. Notably, we rely on that so that if we coalesce // two PHINodes, the iteration over the old PHIs remains valid, and the // mapping will just map us to the new node (which may not even be a PHI // node). const DataLayout &DL = NewFunc->getParent()->getDataLayout(); SmallSetVector Worklist; for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx) if (isa(VMap[PHIToResolve[Idx]])) Worklist.insert(PHIToResolve[Idx]); // Note that we must test the size on each iteration, the worklist can grow. for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) { const Value *OrigV = Worklist[Idx]; auto *I = dyn_cast_or_null(VMap.lookup(OrigV)); if (!I) continue; + // Skip over non-intrinsic callsites, we don't want to remove any nodes from + // the CGSCC. + CallSite CS = CallSite(I); + if (CS && CS.getCalledFunction() && !CS.getCalledFunction()->isIntrinsic()) + continue; + // See if this instruction simplifies. Value *SimpleV = SimplifyInstruction(I, DL); if (!SimpleV) continue; // Stash away all the uses of the old instruction so we can check them for // recursive simplifications after a RAUW. This is cheaper than checking all // uses of To on the recursive step in most cases. for (const User *U : OrigV->users()) Worklist.insert(cast(U)); // Replace the instruction with its simplified value. I->replaceAllUsesWith(SimpleV); // If the original instruction had no side effects, remove it. if (isInstructionTriviallyDead(I)) I->eraseFromParent(); else VMap[OrigV] = I; } // Now that the inlined function body has been fully constructed, go through // and zap unconditional fall-through branches. This happens all the time when // specializing code: code specialization turns conditional branches into // uncond branches, and this code folds them. Function::iterator Begin = cast(VMap[StartingBB])->getIterator(); Function::iterator I = Begin; while (I != NewFunc->end()) { // Check if this block has become dead during inlining or other // simplifications. Note that the first block will appear dead, as it has // not yet been wired up properly. if (I != Begin && (pred_begin(&*I) == pred_end(&*I) || I->getSinglePredecessor() == &*I)) { BasicBlock *DeadBB = &*I++; DeleteDeadBlock(DeadBB); continue; } // We need to simplify conditional branches and switches with a constant // operand. We try to prune these out when cloning, but if the // simplification required looking through PHI nodes, those are only // available after forming the full basic block. That may leave some here, // and we still want to prune the dead code as early as possible. ConstantFoldTerminator(&*I); BranchInst *BI = dyn_cast(I->getTerminator()); if (!BI || BI->isConditional()) { ++I; continue; } BasicBlock *Dest = BI->getSuccessor(0); if (!Dest->getSinglePredecessor()) { ++I; continue; } // We shouldn't be able to get single-entry PHI nodes here, as instsimplify // above should have zapped all of them.. assert(!isa(Dest->begin())); // We know all single-entry PHI nodes in the inlined function have been // removed, so we just need to splice the blocks. BI->eraseFromParent(); // Make all PHI nodes that referred to Dest now refer to I as their source. Dest->replaceAllUsesWith(&*I); // Move all the instructions in the succ to the pred. I->getInstList().splice(I->end(), Dest->getInstList()); // Remove the dest block. Dest->eraseFromParent(); // Do not increment I, iteratively merge all things this block branches to. } // Make a final pass over the basic blocks from the old function to gather // any return instructions which survived folding. We have to do this here // because we can iteratively remove and merge returns above. for (Function::iterator I = cast(VMap[StartingBB])->getIterator(), E = NewFunc->end(); I != E; ++I) if (ReturnInst *RI = dyn_cast(I->getTerminator())) Returns.push_back(RI); } /// This works exactly like CloneFunctionInto, /// except that it does some simple constant prop and DCE on the fly. The /// effect of this is to copy significantly less code in cases where (for /// example) a function call with constant arguments is inlined, and those /// constant arguments cause a significant amount of code in the callee to be /// dead. Since this doesn't produce an exact copy of the input, it can't be /// used for things like CloneFunction or CloneModule. void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap, bool ModuleLevelChanges, SmallVectorImpl &Returns, const char *NameSuffix, ClonedCodeInfo *CodeInfo, Instruction *TheCall) { CloneAndPruneIntoFromInst(NewFunc, OldFunc, &OldFunc->front().front(), VMap, ModuleLevelChanges, Returns, NameSuffix, CodeInfo); } /// \brief Remaps instructions in \p Blocks using the mapping in \p VMap. void llvm::remapInstructionsInBlocks( const SmallVectorImpl &Blocks, ValueToValueMapTy &VMap) { // Rewrite the code to refer to itself. for (auto *BB : Blocks) for (auto &Inst : *BB) RemapInstruction(&Inst, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); } /// \brief Clones a loop \p OrigLoop. Returns the loop and the blocks in \p /// Blocks. /// /// Updates LoopInfo and DominatorTree assuming the loop is dominated by block /// \p LoopDomBB. Insert the new blocks before block specified in \p Before. Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, Loop *OrigLoop, ValueToValueMapTy &VMap, const Twine &NameSuffix, LoopInfo *LI, DominatorTree *DT, SmallVectorImpl &Blocks) { assert(OrigLoop->getSubLoops().empty() && "Loop to be cloned cannot have inner loop"); Function *F = OrigLoop->getHeader()->getParent(); Loop *ParentLoop = OrigLoop->getParentLoop(); Loop *NewLoop = new Loop(); if (ParentLoop) ParentLoop->addChildLoop(NewLoop); else LI->addTopLevelLoop(NewLoop); BasicBlock *OrigPH = OrigLoop->getLoopPreheader(); assert(OrigPH && "No preheader"); BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F); // To rename the loop PHIs. VMap[OrigPH] = NewPH; Blocks.push_back(NewPH); // Update LoopInfo. if (ParentLoop) ParentLoop->addBasicBlockToLoop(NewPH, *LI); // Update DominatorTree. DT->addNewBlock(NewPH, LoopDomBB); for (BasicBlock *BB : OrigLoop->getBlocks()) { BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F); VMap[BB] = NewBB; // Update LoopInfo. NewLoop->addBasicBlockToLoop(NewBB, *LI); // Add DominatorTree node. After seeing all blocks, update to correct IDom. DT->addNewBlock(NewBB, NewPH); Blocks.push_back(NewBB); } for (BasicBlock *BB : OrigLoop->getBlocks()) { // Update DominatorTree. BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock(); DT->changeImmediateDominator(cast(VMap[BB]), cast(VMap[IDomBB])); } // Move them physically from the end of the block list. F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(), NewPH); F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(), NewLoop->getHeader()->getIterator(), F->end()); return NewLoop; } Index: projects/clang390-import/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- projects/clang390-import/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (revision 304769) +++ projects/clang390-import/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (revision 304770) @@ -1,4650 +1,4713 @@ //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // This pass implements the Bottom Up SLP vectorizer. It detects consecutive // stores that can be put together into vector-stores. Next, it attempts to // construct vectorizable tree using the use-def chains. If a profitable tree // was found, the SLP vectorizer performs vectorization on the tree. // // The pass is inspired by the work described in the paper: // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/NoFolder.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/Verifier.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Vectorize.h" #include #include using namespace llvm; using namespace slpvectorizer; #define SV_NAME "slp-vectorizer" #define DEBUG_TYPE "SLP" STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); static cl::opt SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number ")); static cl::opt ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions")); static cl::opt ShouldStartVectorizeHorAtStore( "slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc( "Attempt to vectorize horizontal reductions feeding into a store")); static cl::opt MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); /// Limits the size of scheduling regions in a block. /// It avoid long compile times for _very_ large blocks where vector /// instructions are spread over a wide range. /// This limit is way higher than needed by real-world functions. static cl::opt ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block")); static cl::opt MinVectorRegSizeOption( "slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); -// FIXME: Set this via cl::opt to allow overriding. -static const unsigned RecursionMaxDepth = 12; +static cl::opt RecursionMaxDepth( + "slp-recursion-max-depth", cl::init(12), cl::Hidden, + cl::desc("Limit the recursion depth when building a vectorizable tree")); +static cl::opt MinTreeSize( + "slp-min-tree-size", cl::init(3), cl::Hidden, + cl::desc("Only vectorize small trees if they are fully vectorizable")); + // Limit the number of alias checks. The limit is chosen so that // it has no negative effect on the llvm benchmarks. static const unsigned AliasedCheckLimit = 10; // Another limit for the alias checks: The maximum distance between load/store // instructions where alias checks are done. // This limit is useful for very large basic blocks. static const unsigned MaxMemDepDistance = 160; /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling /// regions to be handled. static const int MinScheduleRegionSize = 16; /// \brief Predicate for the element types that the SLP vectorizer supports. /// /// The most important thing to filter here are types which are invalid in LLVM /// vectors. We also filter target specific types which have absolutely no /// meaningful vectorization path such as x86_fp80 and ppc_f128. This just /// avoids spending time checking the cost model and realizing that they will /// be inevitably scalarized. static bool isValidElementType(Type *Ty) { return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() && !Ty->isPPC_FP128Ty(); } /// \returns the parent basic block if all of the instructions in \p VL /// are in the same block or null otherwise. static BasicBlock *getSameBlock(ArrayRef VL) { Instruction *I0 = dyn_cast(VL[0]); if (!I0) return nullptr; BasicBlock *BB = I0->getParent(); for (int i = 1, e = VL.size(); i < e; i++) { Instruction *I = dyn_cast(VL[i]); if (!I) return nullptr; if (BB != I->getParent()) return nullptr; } return BB; } /// \returns True if all of the values in \p VL are constants. static bool allConstant(ArrayRef VL) { for (Value *i : VL) if (!isa(i)) return false; return true; } /// \returns True if all of the values in \p VL are identical. static bool isSplat(ArrayRef VL) { for (unsigned i = 1, e = VL.size(); i < e; ++i) if (VL[i] != VL[0]) return false; return true; } ///\returns Opcode that can be clubbed with \p Op to create an alternate /// sequence which can later be merged as a ShuffleVector instruction. static unsigned getAltOpcode(unsigned Op) { switch (Op) { case Instruction::FAdd: return Instruction::FSub; case Instruction::FSub: return Instruction::FAdd; case Instruction::Add: return Instruction::Sub; case Instruction::Sub: return Instruction::Add; default: return 0; } } ///\returns bool representing if Opcode \p Op can be part /// of an alternate sequence which can later be merged as /// a ShuffleVector instruction. static bool canCombineAsAltInst(unsigned Op) { return Op == Instruction::FAdd || Op == Instruction::FSub || Op == Instruction::Sub || Op == Instruction::Add; } /// \returns ShuffleVector instruction if instructions in \p VL have /// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence. /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...) static unsigned isAltInst(ArrayRef VL) { Instruction *I0 = dyn_cast(VL[0]); unsigned Opcode = I0->getOpcode(); unsigned AltOpcode = getAltOpcode(Opcode); for (int i = 1, e = VL.size(); i < e; i++) { Instruction *I = dyn_cast(VL[i]); if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode)) return 0; } return Instruction::ShuffleVector; } /// \returns The opcode if all of the Instructions in \p VL have the same /// opcode, or zero. static unsigned getSameOpcode(ArrayRef VL) { Instruction *I0 = dyn_cast(VL[0]); if (!I0) return 0; unsigned Opcode = I0->getOpcode(); for (int i = 1, e = VL.size(); i < e; i++) { Instruction *I = dyn_cast(VL[i]); if (!I || Opcode != I->getOpcode()) { if (canCombineAsAltInst(Opcode) && i == 1) return isAltInst(VL); return 0; } } return Opcode; } /// Get the intersection (logical and) of all of the potential IR flags /// of each scalar operation (VL) that will be converted into a vector (I). /// Flag set: NSW, NUW, exact, and all of fast-math. static void propagateIRFlags(Value *I, ArrayRef VL) { if (auto *VecOp = dyn_cast(I)) { if (auto *Intersection = dyn_cast(VL[0])) { // Intersection is initialized to the 0th scalar, // so start counting from index '1'. for (int i = 1, e = VL.size(); i < e; ++i) { if (auto *Scalar = dyn_cast(VL[i])) Intersection->andIRFlags(Scalar); } VecOp->copyIRFlags(Intersection); } } } /// \returns The type that all of the values in \p VL have or null if there /// are different types. static Type* getSameType(ArrayRef VL) { Type *Ty = VL[0]->getType(); for (int i = 1, e = VL.size(); i < e; i++) if (VL[i]->getType() != Ty) return nullptr; return Ty; } /// \returns True if Extract{Value,Element} instruction extracts element Idx. static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) { assert(Opcode == Instruction::ExtractElement || Opcode == Instruction::ExtractValue); if (Opcode == Instruction::ExtractElement) { ConstantInt *CI = dyn_cast(E->getOperand(1)); return CI && CI->getZExtValue() == Idx; } else { ExtractValueInst *EI = cast(E); return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx; } } /// \returns True if in-tree use also needs extract. This refers to /// possible scalar operand in vectorized instruction. static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI) { unsigned Opcode = UserInst->getOpcode(); switch (Opcode) { case Instruction::Load: { LoadInst *LI = cast(UserInst); return (LI->getPointerOperand() == Scalar); } case Instruction::Store: { StoreInst *SI = cast(UserInst); return (SI->getPointerOperand() == Scalar); } case Instruction::Call: { CallInst *CI = cast(UserInst); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (hasVectorInstrinsicScalarOpd(ID, 1)) { return (CI->getArgOperand(1) == Scalar); } } default: return false; } } /// \returns the AA location that is being access by the instruction. static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) { if (StoreInst *SI = dyn_cast(I)) return MemoryLocation::get(SI); if (LoadInst *LI = dyn_cast(I)) return MemoryLocation::get(LI); return MemoryLocation(); } /// \returns True if the instruction is not a volatile or atomic load/store. static bool isSimple(Instruction *I) { if (LoadInst *LI = dyn_cast(I)) return LI->isSimple(); if (StoreInst *SI = dyn_cast(I)) return SI->isSimple(); if (MemIntrinsic *MI = dyn_cast(I)) return !MI->isVolatile(); return true; } namespace llvm { namespace slpvectorizer { /// Bottom Up SLP Vectorizer. class BoUpSLP { public: typedef SmallVector ValueList; typedef SmallVector InstrList; typedef SmallPtrSet ValueSet; typedef SmallVector StoreList; BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL) : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB), DL(DL), Builder(Se->getContext()) { CodeMetrics::collectEphemeralValues(F, AC, EphValues); // Use the vector register size specified by the target unless overridden // by a command-line option. // TODO: It would be better to limit the vectorization factor based on // data type rather than just register size. For example, x86 AVX has // 256-bit registers, but it does not support integer operations // at that width (that requires AVX2). if (MaxVectorRegSizeOption.getNumOccurrences()) MaxVecRegSize = MaxVectorRegSizeOption; else MaxVecRegSize = TTI->getRegisterBitWidth(true); MinVecRegSize = MinVectorRegSizeOption; } /// \brief Vectorize the tree that starts with the elements in \p VL. /// Returns the vectorized root. Value *vectorizeTree(); /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. int getSpillCost(); /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. int getTreeCost(); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. void buildTree(ArrayRef Roots, ArrayRef UserIgnoreLst = None); /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); NumLoadsWantToKeepOrder = 0; NumLoadsWantToChangeOrder = 0; for (auto &Iter : BlocksSchedules) { BlockScheduling *BS = Iter.second.get(); BS->clear(); } MinBWs.clear(); } /// \brief Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); /// \returns true if it is beneficial to reverse the vector order. bool shouldReorder() const { return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder; } /// \return The vector element size in bits to use when vectorizing the /// expression tree ending at \p V. If V is a store, the size is the width of /// the stored value. Otherwise, the size is the width of the largest loaded /// value reaching V. This method is used by the vectorizer to calculate /// vectorization factors. unsigned getVectorElementSize(Value *V); /// Compute the minimum type sizes required to represent the entries in a /// vectorizable tree. void computeMinimumValueSizes(); // \returns maximum vector register size as set by TTI or overridden by cl::opt. unsigned getMaxVecRegSize() const { return MaxVecRegSize; } // \returns minimum vector register size as set by cl::opt. unsigned getMinVecRegSize() const { return MinVecRegSize; } /// \brief Check if ArrayType or StructType is isomorphic to some VectorType. /// /// \returns number of elements in vector if isomorphism exists, 0 otherwise. unsigned canMapToVector(Type *T, const DataLayout &DL) const; private: struct TreeEntry; /// \returns the cost of the vectorizable entry. int getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. void buildTree_rec(ArrayRef Roots, unsigned Depth); /// \returns True if the ExtractElement/ExtractValue instructions in VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a vector). bool canReuseExtract(ArrayRef VL, unsigned Opcode) const; /// Vectorize a single entry in the tree. Value *vectorizeTree(TreeEntry *E); /// Vectorize a single entry in the tree, starting in \p VL. Value *vectorizeTree(ArrayRef VL); /// \returns the pointer to the vectorized value if \p VL is already /// vectorized, or NULL. They may happen in cycles. Value *alreadyVectorized(ArrayRef VL) const; /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. int getGatherCost(Type *Ty); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. int getGatherCost(ArrayRef VL); /// \brief Set the Builder insert point to one after the last instruction in /// the bundle void setInsertPointAfterBundle(ArrayRef VL); /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef VL, VectorType *Ty); /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(); /// \reorder commutative operands in alt shuffle if they result in /// vectorized code. void reorderAltShuffleOperands(ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right); /// \reorder commutative operands to get better probability of /// generating vectorized code. void reorderInputsAccordingToOpcode(ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right); struct TreeEntry { TreeEntry() : Scalars(), VectorizedValue(nullptr), NeedToGather(0) {} /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef VL) const { assert(VL.size() == Scalars.size() && "Invalid size"); return std::equal(VL.begin(), VL.end(), Scalars.begin()); } /// A vector of scalars. ValueList Scalars; /// The Scalars are vectorized into this value. It is initialized to Null. Value *VectorizedValue; /// Do we need to gather this sequence ? bool NeedToGather; }; /// Create a new VectorizableTree entry. TreeEntry *newTreeEntry(ArrayRef VL, bool Vectorized) { VectorizableTree.emplace_back(); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); ScalarToTreeEntry[VL[i]] = idx; } } else { MustGather.insert(VL.begin(), VL.end()); } return Last; } /// -- Vectorization State -- /// Holds all of the tree entries. std::vector VectorizableTree; /// Maps a specific scalar to its tree entry. SmallDenseMap ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; /// This POD struct describes one external user in the vectorized tree. struct ExternalUser { ExternalUser (Value *S, llvm::User *U, int L) : Scalar(S), User(U), Lane(L){} // Which scalar in our function. Value *Scalar; // Which user that uses the scalar. llvm::User *User; // Which lane does the scalar belong to. int Lane; }; typedef SmallVector UserList; /// Checks if two instructions may access the same memory. /// /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it /// is invariant in the calling loop. bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1, Instruction *Inst2) { // First check if the result is already in the cache. AliasCacheKey key = std::make_pair(Inst1, Inst2); Optional &result = AliasCache[key]; if (result.hasValue()) { return result.getValue(); } MemoryLocation Loc2 = getLocation(Inst2, AA); bool aliased = true; if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) { // Do the alias check. aliased = AA->alias(Loc1, Loc2); } // Store the result in the cache. result = aliased; return aliased; } typedef std::pair AliasCacheKey; /// Cache for alias results. /// TODO: consider moving this to the AliasAnalysis itself. DenseMap> AliasCache; /// Removes an instruction from its block and eventually deletes it. /// It's like Instruction::eraseFromParent() except that the actual deletion /// is delayed until BoUpSLP is destructed. /// This is required to ensure that there are no incorrect collisions in the /// AliasCache, which can happen if a new instruction is allocated at the /// same address as a previously deleted instruction. void eraseInstruction(Instruction *I) { I->removeFromParent(); I->dropAllReferences(); DeletedInstructions.push_back(std::unique_ptr(I)); } /// Temporary store for deleted instructions. Instructions will be deleted /// eventually when the BoUpSLP is destructed. SmallVector, 8> DeletedInstructions; /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). UserList ExternalUses; /// Values used only by @llvm.assume calls. SmallPtrSet EphValues; /// Holds all of the instructions that we gathered. SetVector GatherSeq; /// A list of blocks that we are going to CSE. SetVector CSEBlocks; /// Contains all scheduling relevant data for an instruction. /// A ScheduleData either represents a single instruction or a member of an /// instruction bundle (= a group of instructions which is combined into a /// vector instruction). struct ScheduleData { // The initial value for the dependency counters. It means that the // dependencies are not calculated yet. enum { InvalidDeps = -1 }; ScheduleData() : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr), NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0), Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps), UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {} void init(int BlockSchedulingRegionID) { FirstInBundle = this; NextInBundle = nullptr; NextLoadStore = nullptr; IsScheduled = false; SchedulingRegionID = BlockSchedulingRegionID; UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); } /// Returns true if the dependency information has been calculated. bool hasValidDependencies() const { return Dependencies != InvalidDeps; } /// Returns true for single instructions and for bundle representatives /// (= the head of a bundle). bool isSchedulingEntity() const { return FirstInBundle == this; } /// Returns true if it represents an instruction bundle and not only a /// single instruction. bool isPartOfBundle() const { return NextInBundle != nullptr || FirstInBundle != this; } /// Returns true if it is ready for scheduling, i.e. it has no more /// unscheduled depending instructions/bundles. bool isReady() const { assert(isSchedulingEntity() && "can't consider non-scheduling entity for ready list"); return UnscheduledDepsInBundle == 0 && !IsScheduled; } /// Modifies the number of unscheduled dependencies, also updating it for /// the whole bundle. int incrementUnscheduledDeps(int Incr) { UnscheduledDeps += Incr; return FirstInBundle->UnscheduledDepsInBundle += Incr; } /// Sets the number of unscheduled dependencies to the number of /// dependencies. void resetUnscheduledDeps() { incrementUnscheduledDeps(Dependencies - UnscheduledDeps); } /// Clears all dependency information. void clearDependencies() { Dependencies = InvalidDeps; resetUnscheduledDeps(); MemoryDependencies.clear(); } void dump(raw_ostream &os) const { if (!isSchedulingEntity()) { os << "/ " << *Inst; } else if (NextInBundle) { os << '[' << *Inst; ScheduleData *SD = NextInBundle; while (SD) { os << ';' << *SD->Inst; SD = SD->NextInBundle; } os << ']'; } else { os << *Inst; } } Instruction *Inst; /// Points to the head in an instruction bundle (and always to this for /// single instructions). ScheduleData *FirstInBundle; /// Single linked list of all instructions in a bundle. Null if it is a /// single instruction. ScheduleData *NextInBundle; /// Single linked list of all memory instructions (e.g. load, store, call) /// in the block - until the end of the scheduling region. ScheduleData *NextLoadStore; /// The dependent memory instructions. /// This list is derived on demand in calculateDependencies(). SmallVector MemoryDependencies; /// This ScheduleData is in the current scheduling region if this matches /// the current SchedulingRegionID of BlockScheduling. int SchedulingRegionID; /// Used for getting a "good" final ordering of instructions. int SchedulingPriority; /// The number of dependencies. Constitutes of the number of users of the /// instruction plus the number of dependent memory instructions (if any). /// This value is calculated on demand. /// If InvalidDeps, the number of dependencies is not calculated yet. /// int Dependencies; /// The number of dependencies minus the number of dependencies of scheduled /// instructions. As soon as this is zero, the instruction/bundle gets ready /// for scheduling. /// Note that this is negative as long as Dependencies is not calculated. int UnscheduledDeps; /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for /// single instructions. int UnscheduledDepsInBundle; /// True if this instruction is scheduled (or considered as scheduled in the /// dry-run). bool IsScheduled; }; #ifndef NDEBUG friend inline raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) { SD.dump(os); return os; } #endif /// Contains all scheduling data for a basic block. /// struct BlockScheduling { BlockScheduling(BasicBlock *BB) : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), ScheduleStart(nullptr), ScheduleEnd(nullptr), FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr), ScheduleRegionSize(0), ScheduleRegionSizeLimit(ScheduleRegionSizeBudget), // Make sure that the initial SchedulingRegionID is greater than the // initial SchedulingRegionID in ScheduleData (which is 0). SchedulingRegionID(1) {} void clear() { ReadyInsts.clear(); ScheduleStart = nullptr; ScheduleEnd = nullptr; FirstLoadStoreInRegion = nullptr; LastLoadStoreInRegion = nullptr; // Reduce the maximum schedule region size by the size of the // previous scheduling run. ScheduleRegionSizeLimit -= ScheduleRegionSize; if (ScheduleRegionSizeLimit < MinScheduleRegionSize) ScheduleRegionSizeLimit = MinScheduleRegionSize; ScheduleRegionSize = 0; // Make a new scheduling region, i.e. all existing ScheduleData is not // in the new region yet. ++SchedulingRegionID; } ScheduleData *getScheduleData(Value *V) { ScheduleData *SD = ScheduleDataMap[V]; if (SD && SD->SchedulingRegionID == SchedulingRegionID) return SD; return nullptr; } bool isInSchedulingRegion(ScheduleData *SD) { return SD->SchedulingRegionID == SchedulingRegionID; } /// Marks an instruction as scheduled and puts all dependent ready /// instructions into the ready-list. template void schedule(ScheduleData *SD, ReadyListType &ReadyList) { SD->IsScheduled = true; DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); ScheduleData *BundleMember = SD; while (BundleMember) { // Handle the def-use chain dependencies. for (Use &U : BundleMember->Inst->operands()) { ScheduleData *OpDef = getScheduleData(U.get()); if (OpDef && OpDef->hasValidDependencies() && OpDef->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. ScheduleData *DepBundle = OpDef->FirstInBundle; assert(!DepBundle->IsScheduled && "already scheduled bundle gets ready"); ReadyList.insert(DepBundle); DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n"); } } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; assert(!DepBundle->IsScheduled && "already scheduled bundle gets ready"); ReadyList.insert(DepBundle); DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n"); } } BundleMember = BundleMember->NextInBundle; } } /// Put all instructions into the ReadyList which are ready for scheduling. template void initialFillReadyList(ReadyListType &ReadyList) { for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { ScheduleData *SD = getScheduleData(I); if (SD->isSchedulingEntity() && SD->isReady()) { ReadyList.insert(SD); DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n"); } } } /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. bool tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP); /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef VL); /// Extends the scheduling region so that V is inside the region. /// \returns true if the region size is within the limit. bool extendSchedulingRegion(Value *V); /// Initialize the ScheduleData structures for new instructions in the /// scheduling region. void initScheduleData(Instruction *FromI, Instruction *ToI, ScheduleData *PrevLoadStore, ScheduleData *NextLoadStore); /// Updates the dependency information of a bundle and of all instructions/ /// bundles which depend on the original bundle. void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, BoUpSLP *SLP); /// Sets all instruction in the scheduling region to un-scheduled. void resetSchedule(); BasicBlock *BB; /// Simple memory allocation for ScheduleData. std::vector> ScheduleDataChunks; /// The size of a ScheduleData array in ScheduleDataChunks. int ChunkSize; /// The allocator position in the current chunk, which is the last entry /// of ScheduleDataChunks. int ChunkPos; /// Attaches ScheduleData to Instruction. /// Note that the mapping survives during all vectorization iterations, i.e. /// ScheduleData structures are recycled. DenseMap ScheduleDataMap; struct ReadyList : SmallVector { void insert(ScheduleData *SD) { push_back(SD); } }; /// The ready-list for scheduling (only used for the dry-run). ReadyList ReadyInsts; /// The first instruction of the scheduling region. Instruction *ScheduleStart; /// The first instruction _after_ the scheduling region. Instruction *ScheduleEnd; /// The first memory accessing instruction in the scheduling region /// (can be null). ScheduleData *FirstLoadStoreInRegion; /// The last memory accessing instruction in the scheduling region /// (can be null). ScheduleData *LastLoadStoreInRegion; /// The current size of the scheduling region. int ScheduleRegionSize; /// The maximum size allowed for the scheduling region. int ScheduleRegionSizeLimit; /// The ID of the scheduling region. For a new vectorization iteration this /// is incremented which "removes" all ScheduleData from the region. int SchedulingRegionID; }; /// Attaches the BlockScheduling structures to basic blocks. MapVector> BlocksSchedules; /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. void scheduleBlock(BlockScheduling *BS); /// List of users to ignore during scheduling and that don't need extracting. ArrayRef UserIgnoreList; // Number of load-bundles, which contain consecutive loads. int NumLoadsWantToKeepOrder; // Number of load-bundles of size 2, which are consecutive loads if reversed. int NumLoadsWantToChangeOrder; // Analysis and block reference. Function *F; ScalarEvolution *SE; TargetTransformInfo *TTI; TargetLibraryInfo *TLI; AliasAnalysis *AA; LoopInfo *LI; DominatorTree *DT; AssumptionCache *AC; DemandedBits *DB; const DataLayout *DL; unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. unsigned MinVecRegSize; // Set by cl::opt (default: 128). /// Instruction builder to construct the vectorized tree. IRBuilder<> Builder; /// A map of scalar integer values to the smallest bit width with which they /// can legally be represented. MapVector MinBWs; }; } // end namespace llvm } // end namespace slpvectorizer void BoUpSLP::buildTree(ArrayRef Roots, ArrayRef UserIgnoreLst) { deleteTree(); UserIgnoreList = UserIgnoreLst; if (!getSameType(Roots)) return; buildTree_rec(Roots, 0); // Collect the values that we need to extract from the tree. for (TreeEntry &EIdx : VectorizableTree) { TreeEntry *Entry = &EIdx; // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; // No need to handle users of gathered values. if (Entry->NeedToGather) continue; for (User *U : Scalar->users()) { DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); Instruction *UserInst = dyn_cast(U); if (!UserInst) continue; // Skip in-tree scalars that become vectors if (ScalarToTreeEntry.count(U)) { int Idx = ScalarToTreeEntry[U]; TreeEntry *UseEntry = &VectorizableTree[Idx]; Value *UseScalar = UseEntry->Scalars[0]; // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in Lane 0 will // be used. if (UseScalar != U || !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); continue; } } // Ignore users in the user ignore list. if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) != UserIgnoreList.end()) continue; DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << Lane << " from " << *Scalar << ".\n"); ExternalUses.push_back(ExternalUser(Scalar, U, Lane)); } } } } void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { bool SameTy = allConstant(VL) || getSameType(VL); (void)SameTy; bool isAltShuffle = false; assert(SameTy && "Invalid types!"); if (Depth == RecursionMaxDepth) { DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); newTreeEntry(VL, false); return; } // Don't handle vectors. if (VL[0]->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); newTreeEntry(VL, false); return; } if (StoreInst *SI = dyn_cast(VL[0])) if (SI->getValueOperand()->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); newTreeEntry(VL, false); return; } unsigned Opcode = getSameOpcode(VL); // Check that this shuffle vector refers to the alternate // sequence of opcodes. if (Opcode == Instruction::ShuffleVector) { Instruction *I0 = dyn_cast(VL[0]); unsigned Op = I0->getOpcode(); if (Op != Instruction::ShuffleVector) isAltShuffle = true; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); newTreeEntry(VL, false); return; } // We now know that this is a vector of instructions of the same type from // the same block. // Don't vectorize ephemeral values. for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (EphValues.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); newTreeEntry(VL, false); return; } } // Check if this is a duplicate of another entry. if (ScalarToTreeEntry.count(VL[0])) { int Idx = ScalarToTreeEntry[VL[0]]; TreeEntry *E = &VectorizableTree[Idx]; for (unsigned i = 0, e = VL.size(); i != e; ++i) { DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); if (E->Scalars[i] != VL[i]) { DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); newTreeEntry(VL, false); return; } } DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); return; } // Check that none of the instructions in the bundle are already in the tree. for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (ScalarToTreeEntry.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); newTreeEntry(VL, false); return; } } // If any of the scalars is marked as a value that needs to stay scalar then // we need to gather the scalars. for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); newTreeEntry(VL, false); return; } } // Check that all of the users of the scalars that we want to vectorize are // schedulable. Instruction *VL0 = cast(VL[0]); BasicBlock *BB = cast(VL0)->getParent(); if (!DT->isReachableFromEntry(BB)) { // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); newTreeEntry(VL, false); return; } // Check that every instructions appears once in this bundle. for (unsigned i = 0, e = VL.size(); i < e; ++i) for (unsigned j = i+1; j < e; ++j) if (VL[i] == VL[j]) { DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); newTreeEntry(VL, false); return; } auto &BSRef = BlocksSchedules[BB]; if (!BSRef) { BSRef = llvm::make_unique(BB); } BlockScheduling &BS = *BSRef.get(); if (!BS.tryScheduleBundle(VL, this)) { DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL[0]) || !BS.getScheduleData(VL[0])->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); newTreeEntry(VL, false); return; } DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); switch (Opcode) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); // Check for terminator values (e.g. invoke). for (unsigned j = 0; j < VL.size(); ++j) for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { TerminatorInst *Term = dyn_cast( cast(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i))); if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL); newTreeEntry(VL, false); return; } } newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *j : VL) Operands.push_back(cast(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); buildTree_rec(Operands, Depth + 1); } return; } case Instruction::ExtractValue: case Instruction::ExtractElement: { bool Reuse = canReuseExtract(VL, Opcode); if (Reuse) { DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); } else { BS.cancelScheduling(VL); } newTreeEntry(VL, Reuse); return; } case Instruction::Load: { // Check that a vectorized load would load the same memory as a scalar // load. // For example we don't want vectorize loads that are smaller than 8 bit. // Even though we have a packed struct {} LLVM treats // loading/storing it as an i8 struct. If we vectorize loads/stores from // such a struct we read/write packed bits disagreeing with the // unvectorized version. Type *ScalarTy = VL[0]->getType(); if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } // Check if the loads are consecutive or of we need to swizzle them. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { LoadInst *L = cast(VL[i]); if (!L->isSimple()) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], *DL, *SE)) { ++NumLoadsWantToChangeOrder; } BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); return; } } ++NumLoadsWantToKeepOrder; newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of loads.\n"); return; } case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::FPExt: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::SIToFP: case Instruction::UIToFP: case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); for (unsigned i = 0; i < VL.size(); ++i) { Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); buildTree_rec(Operands, Depth+1); } return; } case Instruction::ICmp: case Instruction::FCmp: { // Check that all of the compares have the same predicate. CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Type *ComparedTy = cast(VL[0])->getOperand(0)->getType(); for (unsigned i = 1, e = VL.size(); i < e; ++i) { CmpInst *Cmp = cast(VL[i]); if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); buildTree_rec(Operands, Depth+1); } return; } case Instruction::Select: case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: case Instruction::UDiv: case Instruction::SDiv: case Instruction::FDiv: case Instruction::URem: case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: case Instruction::And: case Instruction::Or: case Instruction::Xor: { newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to // have the same opcode. if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; reorderInputsAccordingToOpcode(VL, Left, Right); buildTree_rec(Left, Depth + 1); buildTree_rec(Right, Depth + 1); return; } for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); buildTree_rec(Operands, Depth+1); } return; } case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. for (unsigned j = 0; j < VL.size(); ++j) { if (cast(VL[j])->getNumOperands() != 2) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL); newTreeEntry(VL, false); return; } } // We can't combine several GEPs into one vector if they operate on // different types. Type *Ty0 = cast(VL0)->getOperand(0)->getType(); for (unsigned j = 0; j < VL.size(); ++j) { Type *CurTy = cast(VL[j])->getOperand(0)->getType(); if (Ty0 != CurTy) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL); newTreeEntry(VL, false); return; } } // We don't combine GEPs with non-constant indexes. for (unsigned j = 0; j < VL.size(); ++j) { auto Op = cast(VL[j])->getOperand(1); if (!isa(Op)) { DEBUG( dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL); newTreeEntry(VL, false); return; } } newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); buildTree_rec(Operands, Depth + 1); } return; } case Instruction::Store: { // Check if the stores are consecutive or of we need to swizzle them. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; for (Value *j : VL) Operands.push_back(cast(j)->getOperand(0)); buildTree_rec(Operands, Depth + 1); return; } case Instruction::Call: { // Check if the calls are all to the same vectorizable intrinsic. CallInst *CI = cast(VL[0]); // Check if this is an Intrinsic call or something that can be // represented by an intrinsic call Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } Function *Int = CI->getCalledFunction(); Value *A1I = nullptr; if (hasVectorInstrinsicScalarOpd(ID, 1)) A1I = CI->getArgOperand(1); for (unsigned i = 1, e = VL.size(); i != e; ++i) { CallInst *CI2 = dyn_cast(VL[i]); if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; } // ctlz,cttz and powi are special intrinsics whose second argument // should be same in order for them to be vectorized. if (hasVectorInstrinsicScalarOpd(ID, 1)) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument "<< A1I<<"!=" << A1J << "\n"); return; } } // Verify that the bundle operands are identical between the two calls. if (CI->hasOperandBundles() && !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } newTreeEntry(VL, true); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *j : VL) { CallInst *CI2 = dyn_cast(j); Operands.push_back(CI2->getArgOperand(i)); } buildTree_rec(Operands, Depth + 1); } return; } case Instruction::ShuffleVector: { // If this is not an alternate sequence of opcode like add-sub // then do not vectorize this instruction. if (!isAltShuffle) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; reorderAltShuffleOperands(VL, Left, Right); buildTree_rec(Left, Depth + 1); buildTree_rec(Right, Depth + 1); return; } for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); buildTree_rec(Operands, Depth + 1); } return; } default: BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } } unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { unsigned N; Type *EltTy; auto *ST = dyn_cast(T); if (ST) { N = ST->getNumElements(); EltTy = *ST->element_begin(); } else { N = cast(T)->getNumElements(); EltTy = cast(T)->getElementType(); } if (!isValidElementType(EltTy)) return 0; uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N)); if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T)) return 0; if (ST) { // Check that struct is homogeneous. for (const auto *Ty : ST->elements()) if (Ty != EltTy) return 0; } return N; } bool BoUpSLP::canReuseExtract(ArrayRef VL, unsigned Opcode) const { assert(Opcode == Instruction::ExtractElement || Opcode == Instruction::ExtractValue); assert(Opcode == getSameOpcode(VL) && "Invalid opcode"); // Check if all of the extracts come from the same vector and from the // correct offset. Value *VL0 = VL[0]; Instruction *E0 = cast(VL0); Value *Vec = E0->getOperand(0); // We have to extract from a vector/aggregate with the same number of elements. unsigned NElts; if (Opcode == Instruction::ExtractValue) { const DataLayout &DL = E0->getModule()->getDataLayout(); NElts = canMapToVector(Vec->getType(), DL); if (!NElts) return false; // Check if load can be rewritten as load of vector. LoadInst *LI = dyn_cast(Vec); if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) return false; } else { NElts = Vec->getType()->getVectorNumElements(); } if (NElts != VL.size()) return false; // Check that all of the indices extract from the correct offset. if (!matchExtractIndex(E0, 0, Opcode)) return false; for (unsigned i = 1, e = VL.size(); i < e; ++i) { Instruction *E = cast(VL[i]); if (!matchExtractIndex(E, i, Opcode)) return false; if (E->getOperand(0) != Vec) return false; } return true; } int BoUpSLP::getEntryCost(TreeEntry *E) { ArrayRef VL = E->Scalars; Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast(VL[0])) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); // If we have computed a smaller type for the expression, update VecTy so // that the costs will be accurate. if (MinBWs.count(VL[0])) VecTy = VectorType::get(IntegerType::get(F->getContext(), MinBWs[VL[0]]), VL.size()); if (E->NeedToGather) { if (allConstant(VL)) return 0; if (isSplat(VL)) { return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } return getGatherCost(E->Scalars); } unsigned Opcode = getSameOpcode(VL); assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL"); Instruction *VL0 = cast(VL[0]); switch (Opcode) { case Instruction::PHI: { return 0; } case Instruction::ExtractValue: case Instruction::ExtractElement: { if (canReuseExtract(VL, Opcode)) { int DeadCost = 0; for (unsigned i = 0, e = VL.size(); i < e; ++i) { Instruction *E = cast(VL[i]); if (E->hasOneUse()) // Take credit for instruction that will become dead. DeadCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); } return -DeadCost; } return getGatherCost(VecTy); } case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::FPExt: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::SIToFP: case Instruction::UIToFP: case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); // Calculate the cost of this instruction. int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), VL0->getType(), SrcTy); VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); return VecCost - ScalarCost; } case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); return VecCost - ScalarCost; } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: case Instruction::UDiv: case Instruction::SDiv: case Instruction::FDiv: case Instruction::URem: case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: case Instruction::And: case Instruction::Or: case Instruction::Xor: { // Certain instructions can be cheaper to vectorize if they have a // constant second vector operand. TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_UniformConstantValue; TargetTransformInfo::OperandValueProperties Op1VP = TargetTransformInfo::OP_None; TargetTransformInfo::OperandValueProperties Op2VP = TargetTransformInfo::OP_None; // If all operands are exactly the same ConstantInt then set the // operand kind to OK_UniformConstantValue. // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. ConstantInt *CInt = nullptr; for (unsigned i = 0; i < VL.size(); ++i) { const Instruction *I = cast(VL[i]); if (!isa(I->getOperand(1))) { Op2VK = TargetTransformInfo::OK_AnyValue; break; } if (i == 0) { CInt = cast(I->getOperand(1)); continue; } if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt != cast(I->getOperand(1))) Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } // FIXME: Currently cost of model modification for division by power of // 2 is handled for X86 and AArch64. Add support for other targets. if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt && CInt->getValue().isPowerOf2()) Op2VP = TargetTransformInfo::OP_PowerOf2; int ScalarCost = VecTy->getNumElements() * TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP); int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, Op1VP, Op2VP); return VecCost - ScalarCost; } case Instruction::GetElementPtr: { TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_UniformConstantValue; int ScalarCost = VecTy->getNumElements() * TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); int VecCost = TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); return VecCost - ScalarCost; } case Instruction::Load: { // Cost of wide load - cost of scalar loads. unsigned alignment = dyn_cast(VL0)->getAlignment(); int ScalarLdCost = VecTy->getNumElements() * TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0); int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0); return VecLdCost - ScalarLdCost; } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. unsigned alignment = dyn_cast(VL0)->getAlignment(); int ScalarStCost = VecTy->getNumElements() * TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0); int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0); return VecStCost - ScalarStCost; } case Instruction::Call: { CallInst *CI = cast(VL0); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. SmallVector ScalarTys, VecTys; for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) { ScalarTys.push_back(CI->getArgOperand(op)->getType()); VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(), VecTy->getNumElements())); } FastMathFlags FMF; if (auto *FPMO = dyn_cast(CI)) FMF = FPMO->getFastMathFlags(); int ScalarCallCost = VecTy->getNumElements() * TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF); DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost << " (" << VecCallCost << "-" << ScalarCallCost << ")" << " for " << *CI << "\n"); return VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_AnyValue; int ScalarCost = 0; int VecCost = 0; for (Value *i : VL) { Instruction *I = cast(i); if (!I) break; ScalarCost += TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. Instruction *I0 = cast(VL[0]); VecCost = TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); Instruction *I1 = cast(VL[1]); VecCost += TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); return VecCost - ScalarCost; } default: llvm_unreachable("Unknown instruction"); } } bool BoUpSLP::isFullyVectorizableTinyTree() { DEBUG(dbgs() << "SLP: Check whether the tree with height " << VectorizableTree.size() << " is fully vectorizable .\n"); // We only handle trees of height 2. if (VectorizableTree.size() != 2) return false; // Handle splat and all-constants stores. if (!VectorizableTree[0].NeedToGather && (allConstant(VectorizableTree[1].Scalars) || isSplat(VectorizableTree[1].Scalars))) return true; // Gathering cost would be too much for tiny trees. if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) return false; return true; } int BoUpSLP::getSpillCost() { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, // query TTI to see if there is a cost to keeping values live over it // (for example, if spills and fills are required). unsigned BundleWidth = VectorizableTree.front().Scalars.size(); int Cost = 0; SmallPtrSet LiveValues; Instruction *PrevInst = nullptr; for (const auto &N : VectorizableTree) { Instruction *Inst = dyn_cast(N.Scalars[0]); if (!Inst) continue; if (!PrevInst) { PrevInst = Inst; continue; } // Update LiveValues. LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { if (isa(&*J) && ScalarToTreeEntry.count(&*J)) LiveValues.insert(cast(&*J)); } DEBUG( dbgs() << "SLP: #LV: " << LiveValues.size(); for (auto *X : LiveValues) dbgs() << " " << X->getName(); dbgs() << ", Looking at "; Inst->dump(); ); // Now find the sequence of instructions between PrevInst and Inst. BasicBlock::reverse_iterator InstIt(Inst->getIterator()), PrevInstIt(PrevInst->getIterator()); --PrevInstIt; while (InstIt != PrevInstIt) { if (PrevInstIt == PrevInst->getParent()->rend()) { PrevInstIt = Inst->getParent()->rbegin(); continue; } if (isa(&*PrevInstIt) && &*PrevInstIt != PrevInst) { SmallVector V; for (auto *II : LiveValues) V.push_back(VectorType::get(II->getType(), BundleWidth)); Cost += TTI->getCostOfKeepingLiveOverCall(V); } ++PrevInstIt; } PrevInst = Inst; } return Cost; } int BoUpSLP::getTreeCost() { int Cost = 0; DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); // We only vectorize tiny trees if it is fully vectorizable. - if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) { + if (VectorizableTree.size() < MinTreeSize && !isFullyVectorizableTinyTree()) { if (VectorizableTree.empty()) { assert(!ExternalUses.size() && "We should not have any external users"); } return INT_MAX; } unsigned BundleWidth = VectorizableTree[0].Scalars.size(); for (TreeEntry &TE : VectorizableTree) { int C = getEntryCost(&TE); DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " << *TE.Scalars[0] << ".\n"); Cost += C; } SmallSet ExtractCostCalculated; int ExtractCost = 0; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. if (!ExtractCostCalculated.insert(EU.Scalar).second) continue; // Uses by ephemeral values are free (because the ephemeral value will be // removed prior to code generation, and so the extraction will be // removed as well). if (EphValues.count(EU.User)) continue; // If we plan to rewrite the tree in a smaller type, we will need to sign // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); auto *ScalarRoot = VectorizableTree[0].Scalars[0]; if (MinBWs.count(ScalarRoot)) { auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]); VecTy = VectorType::get(MinTy, BundleWidth); ExtractCost += TTI->getExtractWithExtendCost( Instruction::SExt, EU.Scalar->getType(), VecTy, EU.Lane); } else { ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); } } int SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n" << "SLP: Extract Cost = " << ExtractCost << ".\n" << "SLP: Total Cost = " << Cost << ".\n"); return Cost; } int BoUpSLP::getGatherCost(Type *Ty) { int Cost = 0; for (unsigned i = 0, e = cast(Ty)->getNumElements(); i < e; ++i) Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); return Cost; } int BoUpSLP::getGatherCost(ArrayRef VL) { // Find the type of the operands in VL. Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast(VL[0])) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); // Find the cost of inserting/extracting values from the vector. return getGatherCost(VecTy); } // Reorder commutative operations in alternate shuffle if the resulting vectors // are consecutive loads. This would allow us to vectorize the tree. // If we have something like- // load a[0] - load b[0] // load b[1] + load a[1] // load a[2] - load b[2] // load a[3] + load b[3] // Reordering the second load b[1] load a[1] would allow us to vectorize this // code. void BoUpSLP::reorderAltShuffleOperands(ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right) { // Push left and right operands of binary operation into Left and Right for (Value *i : VL) { Left.push_back(cast(i)->getOperand(0)); Right.push_back(cast(i)->getOperand(1)); } // Reorder if we have a commutative operation and consecutive access // are on either side of the alternate instructions. for (unsigned j = 0; j < VL.size() - 1; ++j) { if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { Instruction *VL1 = cast(VL[j]); Instruction *VL2 = cast(VL[j + 1]); if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j], Right[j]); continue; } else if (VL2->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j + 1], Right[j + 1]); continue; } // else unchanged } } if (LoadInst *L = dyn_cast(Right[j])) { if (LoadInst *L1 = dyn_cast(Left[j + 1])) { Instruction *VL1 = cast(VL[j]); Instruction *VL2 = cast(VL[j + 1]); if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j], Right[j]); continue; } else if (VL2->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j + 1], Right[j + 1]); continue; } // else unchanged } } } } // Return true if I should be commuted before adding it's left and right // operands to the arrays Left and Right. // // The vectorizer is trying to either have all elements one side being // instruction with the same opcode to enable further vectorization, or having // a splat to lower the vectorizing cost. static bool shouldReorderOperands(int i, Instruction &I, SmallVectorImpl &Left, SmallVectorImpl &Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, bool SplatLeft, bool SplatRight) { Value *VLeft = I.getOperand(0); Value *VRight = I.getOperand(1); // If we have "SplatRight", try to see if commuting is needed to preserve it. if (SplatRight) { if (VRight == Right[i - 1]) // Preserve SplatRight return false; if (VLeft == Right[i - 1]) { // Commuting would preserve SplatRight, but we don't want to break // SplatLeft either, i.e. preserve the original order if possible. // (FIXME: why do we care?) if (SplatLeft && VLeft == Left[i - 1]) return false; return true; } } // Symmetrically handle Right side. if (SplatLeft) { if (VLeft == Left[i - 1]) // Preserve SplatLeft return false; if (VRight == Left[i - 1]) return true; } Instruction *ILeft = dyn_cast(VLeft); Instruction *IRight = dyn_cast(VRight); // If we have "AllSameOpcodeRight", try to see if the left operands preserves // it and not the right, in this case we want to commute. if (AllSameOpcodeRight) { unsigned RightPrevOpcode = cast(Right[i - 1])->getOpcode(); if (IRight && RightPrevOpcode == IRight->getOpcode()) // Do not commute, a match on the right preserves AllSameOpcodeRight return false; if (ILeft && RightPrevOpcode == ILeft->getOpcode()) { // We have a match and may want to commute, but first check if there is // not also a match on the existing operands on the Left to preserve // AllSameOpcodeLeft, i.e. preserve the original order if possible. // (FIXME: why do we care?) if (AllSameOpcodeLeft && ILeft && cast(Left[i - 1])->getOpcode() == ILeft->getOpcode()) return false; return true; } } // Symmetrically handle Left side. if (AllSameOpcodeLeft) { unsigned LeftPrevOpcode = cast(Left[i - 1])->getOpcode(); if (ILeft && LeftPrevOpcode == ILeft->getOpcode()) return false; if (IRight && LeftPrevOpcode == IRight->getOpcode()) return true; } return false; } void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right) { if (VL.size()) { // Peel the first iteration out of the loop since there's nothing // interesting to do anyway and it simplifies the checks in the loop. auto VLeft = cast(VL[0])->getOperand(0); auto VRight = cast(VL[0])->getOperand(1); if (!isa(VRight) && isa(VLeft)) // Favor having instruction to the right. FIXME: why? std::swap(VLeft, VRight); Left.push_back(VLeft); Right.push_back(VRight); } // Keep track if we have instructions with all the same opcode on one side. bool AllSameOpcodeLeft = isa(Left[0]); bool AllSameOpcodeRight = isa(Right[0]); // Keep track if we have one side with all the same value (broadcast). bool SplatLeft = true; bool SplatRight = true; for (unsigned i = 1, e = VL.size(); i != e; ++i) { Instruction *I = cast(VL[i]); assert(I->isCommutative() && "Can only process commutative instruction"); // Commute to favor either a splat or maximizing having the same opcodes on // one side. if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft, AllSameOpcodeRight, SplatLeft, SplatRight)) { Left.push_back(I->getOperand(1)); Right.push_back(I->getOperand(0)); } else { Left.push_back(I->getOperand(0)); Right.push_back(I->getOperand(1)); } // Update Splat* and AllSameOpcode* after the insertion. SplatRight = SplatRight && (Right[i - 1] == Right[i]); SplatLeft = SplatLeft && (Left[i - 1] == Left[i]); AllSameOpcodeLeft = AllSameOpcodeLeft && isa(Left[i]) && (cast(Left[i - 1])->getOpcode() == cast(Left[i])->getOpcode()); AllSameOpcodeRight = AllSameOpcodeRight && isa(Right[i]) && (cast(Right[i - 1])->getOpcode() == cast(Right[i])->getOpcode()); } // If one operand end up being broadcast, return this operand order. if (SplatRight || SplatLeft) return; // Finally check if we can get longer vectorizable chain by reordering // without breaking the good operand order detected above. // E.g. If we have something like- // load a[0] load b[0] // load b[1] load a[1] // load a[2] load b[2] // load a[3] load b[3] // Reordering the second load b[1] load a[1] would allow us to vectorize // this code and we still retain AllSameOpcode property. // FIXME: This load reordering might break AllSameOpcode in some rare cases // such as- // add a[0],c[0] load b[0] // add a[1],c[2] load b[1] // b[2] load b[2] // add a[3],c[3] load b[3] for (unsigned j = 0; j < VL.size() - 1; ++j) { if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { if (isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j + 1], Right[j + 1]); continue; } } } if (LoadInst *L = dyn_cast(Right[j])) { if (LoadInst *L1 = dyn_cast(Left[j + 1])) { if (isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j + 1], Right[j + 1]); continue; } } } // else unchanged } } void BoUpSLP::setInsertPointAfterBundle(ArrayRef VL) { - Instruction *VL0 = cast(VL[0]); - BasicBlock::iterator NextInst(VL0); - ++NextInst; - Builder.SetInsertPoint(VL0->getParent(), NextInst); - Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + + // Get the basic block this bundle is in. All instructions in the bundle + // should be in this block. + auto *Front = cast(VL.front()); + auto *BB = Front->getParent(); + assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool { + return cast(V)->getParent() == BB; + })); + + // The last instruction in the bundle in program order. + Instruction *LastInst = nullptr; + + // Find the last instruction. The common case should be that BB has been + // scheduled, and the last instruction is VL.back(). So we start with + // VL.back() and iterate over schedule data until we reach the end of the + // bundle. The end of the bundle is marked by null ScheduleData. + if (BlocksSchedules.count(BB)) { + auto *Bundle = BlocksSchedules[BB]->getScheduleData(VL.back()); + if (Bundle && Bundle->isPartOfBundle()) + for (; Bundle; Bundle = Bundle->NextInBundle) + LastInst = Bundle->Inst; + } + + // LastInst can still be null at this point if there's either not an entry + // for BB in BlocksSchedules or there's no ScheduleData available for + // VL.back(). This can be the case if buildTree_rec aborts for various + // reasons (e.g., the maximum recursion depth is reached, the maximum region + // size is reached, etc.). ScheduleData is initialized in the scheduling + // "dry-run". + // + // If this happens, we can still find the last instruction by brute force. We + // iterate forwards from Front (inclusive) until we either see all + // instructions in the bundle or reach the end of the block. If Front is the + // last instruction in program order, LastInst will be set to Front, and we + // will visit all the remaining instructions in the block. + // + // One of the reasons we exit early from buildTree_rec is to place an upper + // bound on compile-time. Thus, taking an additional compile-time hit here is + // not ideal. However, this should be exceedingly rare since it requires that + // we both exit early from buildTree_rec and that the bundle be out-of-order + // (causing us to iterate all the way to the end of the block). + if (!LastInst) { + SmallPtrSet Bundle(VL.begin(), VL.end()); + for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { + if (Bundle.erase(&I)) + LastInst = &I; + if (Bundle.empty()) + break; + } + } + + // Set the insertion point after the last instruction in the bundle. Set the + // debug location to Front. + Builder.SetInsertPoint(BB, next(BasicBlock::iterator(LastInst))); + Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } Value *BoUpSLP::Gather(ArrayRef VL, VectorType *Ty) { Value *Vec = UndefValue::get(Ty); // Generate the 'InsertElement' instruction. for (unsigned i = 0; i < Ty->getNumElements(); ++i) { Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); if (Instruction *Insrt = dyn_cast(Vec)) { GatherSeq.insert(Insrt); CSEBlocks.insert(Insrt->getParent()); // Add to our 'need-to-extract' list. if (ScalarToTreeEntry.count(VL[i])) { int Idx = ScalarToTreeEntry[VL[i]]; TreeEntry *E = &VectorizableTree[Idx]; // Find which lane we need to extract. int FoundLane = -1; for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { // Is this the lane of the scalar that we are looking for ? if (E->Scalars[Lane] == VL[i]) { FoundLane = Lane; break; } } assert(FoundLane >= 0 && "Could not find the correct lane"); ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); } } } return Vec; } Value *BoUpSLP::alreadyVectorized(ArrayRef VL) const { SmallDenseMap::const_iterator Entry = ScalarToTreeEntry.find(VL[0]); if (Entry != ScalarToTreeEntry.end()) { int Idx = Entry->second; const TreeEntry *En = &VectorizableTree[Idx]; if (En->isSame(VL) && En->VectorizedValue) return En->VectorizedValue; } return nullptr; } Value *BoUpSLP::vectorizeTree(ArrayRef VL) { if (ScalarToTreeEntry.count(VL[0])) { int Idx = ScalarToTreeEntry[VL[0]]; TreeEntry *E = &VectorizableTree[Idx]; if (E->isSame(VL)) return vectorizeTree(E); } Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast(VL[0])) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); return Gather(VL, VecTy); } Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IRBuilder<>::InsertPointGuard Guard(Builder); if (E->VectorizedValue) { DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); return E->VectorizedValue; } Instruction *VL0 = cast(E->Scalars[0]); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); if (E->NeedToGather) { setInsertPointAfterBundle(E->Scalars); - return Gather(E->Scalars, VecTy); + auto *V = Gather(E->Scalars, VecTy); + E->VectorizedValue = V; + return V; } unsigned Opcode = getSameOpcode(E->Scalars); switch (Opcode) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); E->VectorizedValue = NewPhi; // PHINodes may have multiple entries from the same block. We want to // visit every block once. SmallSet VisitedBBs; for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; BasicBlock *IBB = PH->getIncomingBlock(i); if (!VisitedBBs.insert(IBB).second) { NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); continue; } // Prepare the operand vector. for (Value *V : E->Scalars) Operands.push_back(cast(V)->getIncomingValueForBlock(IBB)); Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); Value *Vec = vectorizeTree(Operands); NewPhi->addIncoming(Vec, IBB); } assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && "Invalid number of incoming values"); return NewPhi; } case Instruction::ExtractElement: { if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) { Value *V = VL0->getOperand(0); E->VectorizedValue = V; return V; } - return Gather(E->Scalars, VecTy); + setInsertPointAfterBundle(E->Scalars); + auto *V = Gather(E->Scalars, VecTy); + E->VectorizedValue = V; + return V; } case Instruction::ExtractValue: { if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) { LoadInst *LI = cast(VL0->getOperand(0)); Builder.SetInsertPoint(LI); PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment()); E->VectorizedValue = V; return propagateMetadata(V, E->Scalars); } - return Gather(E->Scalars, VecTy); + setInsertPointAfterBundle(E->Scalars); + auto *V = Gather(E->Scalars, VecTy); + E->VectorizedValue = V; + return V; } case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::FPExt: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::SIToFP: case Instruction::UIToFP: case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { ValueList INVL; for (Value *V : E->Scalars) INVL.push_back(cast(V)->getOperand(0)); setInsertPointAfterBundle(E->Scalars); Value *InVec = vectorizeTree(INVL); if (Value *V = alreadyVectorized(E->Scalars)) return V; CastInst *CI = dyn_cast(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); E->VectorizedValue = V; ++NumVectorInstructions; return V; } case Instruction::FCmp: case Instruction::ICmp: { ValueList LHSV, RHSV; for (Value *V : E->Scalars) { LHSV.push_back(cast(V)->getOperand(0)); RHSV.push_back(cast(V)->getOperand(1)); } setInsertPointAfterBundle(E->Scalars); Value *L = vectorizeTree(LHSV); Value *R = vectorizeTree(RHSV); if (Value *V = alreadyVectorized(E->Scalars)) return V; CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; if (Opcode == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); E->VectorizedValue = V; ++NumVectorInstructions; return V; } case Instruction::Select: { ValueList TrueVec, FalseVec, CondVec; for (Value *V : E->Scalars) { CondVec.push_back(cast(V)->getOperand(0)); TrueVec.push_back(cast(V)->getOperand(1)); FalseVec.push_back(cast(V)->getOperand(2)); } setInsertPointAfterBundle(E->Scalars); Value *Cond = vectorizeTree(CondVec); Value *True = vectorizeTree(TrueVec); Value *False = vectorizeTree(FalseVec); if (Value *V = alreadyVectorized(E->Scalars)) return V; Value *V = Builder.CreateSelect(Cond, True, False); E->VectorizedValue = V; ++NumVectorInstructions; return V; } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: case Instruction::UDiv: case Instruction::SDiv: case Instruction::FDiv: case Instruction::URem: case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: case Instruction::And: case Instruction::Or: case Instruction::Xor: { ValueList LHSVL, RHSVL; if (isa(VL0) && VL0->isCommutative()) reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); else for (Value *V : E->Scalars) { LHSVL.push_back(cast(V)->getOperand(0)); RHSVL.push_back(cast(V)->getOperand(1)); } setInsertPointAfterBundle(E->Scalars); Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); if (LHS == RHS && isa(LHS)) { assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); } if (Value *V = alreadyVectorized(E->Scalars)) return V; BinaryOperator *BinOp = cast(VL0); Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); E->VectorizedValue = V; propagateIRFlags(E->VectorizedValue, E->Scalars); ++NumVectorInstructions; if (Instruction *I = dyn_cast(V)) return propagateMetadata(I, E->Scalars); return V; } case Instruction::Load: { // Loads are inserted at the head of the tree because we don't want to // sink them all the way down past store instructions. setInsertPointAfterBundle(E->Scalars); LoadInst *LI = cast(VL0); Type *ScalarLoadTy = LI->getType(); unsigned AS = LI->getPointerAddressSpace(); Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo(AS)); // The pointer operand uses an in-tree scalar so we add the new BitCast to // ExternalUses list to make sure that an extract will be generated in the // future. if (ScalarToTreeEntry.count(LI->getPointerOperand())) ExternalUses.push_back( ExternalUser(LI->getPointerOperand(), cast(VecPtr), 0)); unsigned Alignment = LI->getAlignment(); LI = Builder.CreateLoad(VecPtr); if (!Alignment) { Alignment = DL->getABITypeAlignment(ScalarLoadTy); } LI->setAlignment(Alignment); E->VectorizedValue = LI; ++NumVectorInstructions; return propagateMetadata(LI, E->Scalars); } case Instruction::Store: { StoreInst *SI = cast(VL0); unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); ValueList ValueOp; for (Value *V : E->Scalars) ValueOp.push_back(cast(V)->getValueOperand()); setInsertPointAfterBundle(E->Scalars); Value *VecValue = vectorizeTree(ValueOp); Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo(AS)); StoreInst *S = Builder.CreateStore(VecValue, VecPtr); // The pointer operand uses an in-tree scalar so we add the new BitCast to // ExternalUses list to make sure that an extract will be generated in the // future. if (ScalarToTreeEntry.count(SI->getPointerOperand())) ExternalUses.push_back( ExternalUser(SI->getPointerOperand(), cast(VecPtr), 0)); if (!Alignment) { Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); } S->setAlignment(Alignment); E->VectorizedValue = S; ++NumVectorInstructions; return propagateMetadata(S, E->Scalars); } case Instruction::GetElementPtr: { setInsertPointAfterBundle(E->Scalars); ValueList Op0VL; for (Value *V : E->Scalars) Op0VL.push_back(cast(V)->getOperand(0)); Value *Op0 = vectorizeTree(Op0VL); std::vector OpVecs; for (int j = 1, e = cast(VL0)->getNumOperands(); j < e; ++j) { ValueList OpVL; for (Value *V : E->Scalars) OpVL.push_back(cast(V)->getOperand(j)); Value *OpVec = vectorizeTree(OpVL); OpVecs.push_back(OpVec); } Value *V = Builder.CreateGEP( cast(VL0)->getSourceElementType(), Op0, OpVecs); E->VectorizedValue = V; ++NumVectorInstructions; if (Instruction *I = dyn_cast(V)) return propagateMetadata(I, E->Scalars); return V; } case Instruction::Call: { CallInst *CI = cast(VL0); setInsertPointAfterBundle(E->Scalars); Function *FI; Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; if (CI && (FI = CI->getCalledFunction())) { IID = FI->getIntrinsicID(); } std::vector OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { ValueList OpVL; // ctlz,cttz and powi are special intrinsics whose second argument is // a scalar. This argument should not be vectorized. if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { CallInst *CEI = cast(E->Scalars[0]); ScalarArg = CEI->getArgOperand(j); OpVecs.push_back(CEI->getArgOperand(j)); continue; } for (Value *V : E->Scalars) { CallInst *CEI = cast(V); OpVL.push_back(CEI->getArgOperand(j)); } Value *OpVec = vectorizeTree(OpVL); DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); } Module *M = F->getParent(); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; Function *CF = Intrinsic::getDeclaration(M, ID, Tys); SmallVector OpBundles; CI->getOperandBundlesAsDefs(OpBundles); Value *V = Builder.CreateCall(CF, OpVecs, OpBundles); // The scalar argument uses an in-tree scalar so we add the new vectorized // call to ExternalUses list to make sure that an extract will be // generated in the future. if (ScalarArg && ScalarToTreeEntry.count(ScalarArg)) ExternalUses.push_back(ExternalUser(ScalarArg, cast(V), 0)); E->VectorizedValue = V; ++NumVectorInstructions; return V; } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; assert(isa(VL0) && "Invalid Shuffle Vector Operand"); reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); setInsertPointAfterBundle(E->Scalars); Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); if (Value *V = alreadyVectorized(E->Scalars)) return V; // Create a vector of LHS op1 RHS BinaryOperator *BinOp0 = cast(VL0); Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); // Create a vector of LHS op2 RHS Instruction *VL1 = cast(E->Scalars[1]); BinaryOperator *BinOp1 = cast(VL1); Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); // Create shuffle to take alternate operations from the vector. // Also, gather up odd and even scalar ops to propagate IR flags to // each vector operation. ValueList OddScalars, EvenScalars; unsigned e = E->Scalars.size(); SmallVector Mask(e); for (unsigned i = 0; i < e; ++i) { if (i & 1) { Mask[i] = Builder.getInt32(e + i); OddScalars.push_back(E->Scalars[i]); } else { Mask[i] = Builder.getInt32(i); EvenScalars.push_back(E->Scalars[i]); } } Value *ShuffleMask = ConstantVector::get(Mask); propagateIRFlags(V0, EvenScalars); propagateIRFlags(V1, OddScalars); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); E->VectorizedValue = V; ++NumVectorInstructions; if (Instruction *I = dyn_cast(V)) return propagateMetadata(I, E->Scalars); return V; } default: llvm_unreachable("unknown inst"); } return nullptr; } Value *BoUpSLP::vectorizeTree() { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { scheduleBlock(BSIter.second.get()); } Builder.SetInsertPoint(&F->getEntryBlock().front()); auto *VectorRoot = vectorizeTree(&VectorizableTree[0]); // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. auto *ScalarRoot = VectorizableTree[0].Scalars[0]; if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); auto BundleWidth = VectorizableTree[0].Scalars.size(); auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]); auto *VecTy = VectorType::get(MinTy, BundleWidth); auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); VectorizableTree[0].VectorizedValue = Trunc; } DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); // Extract all of the elements with the external uses. for (const auto &ExternalUse : ExternalUses) { Value *Scalar = ExternalUse.Scalar; llvm::User *User = ExternalUse.User; // Skip users that we already RAUW. This happens when one instruction // has multiple uses of the same value. if (std::find(Scalar->user_begin(), Scalar->user_end(), User) == Scalar->user_end()) continue; assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); int Idx = ScalarToTreeEntry[Scalar]; TreeEntry *E = &VectorizableTree[Idx]; assert(!E->NeedToGather && "Extracting from a gather list"); Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); Value *Lane = Builder.getInt32(ExternalUse.Lane); // Generate extracts for out-of-tree users. // Find the insertion point for the extractelement lane. if (auto *VecI = dyn_cast(Vec)) { if (PHINode *PH = dyn_cast(User)) { for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { if (PH->getIncomingValue(i) == Scalar) { TerminatorInst *IncomingTerminator = PH->getIncomingBlock(i)->getTerminator(); if (isa(IncomingTerminator)) { Builder.SetInsertPoint(VecI->getParent(), std::next(VecI->getIterator())); } else { Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); } Value *Ex = Builder.CreateExtractElement(Vec, Lane); if (MinBWs.count(ScalarRoot)) Ex = Builder.CreateSExt(Ex, Scalar->getType()); CSEBlocks.insert(PH->getIncomingBlock(i)); PH->setOperand(i, Ex); } } } else { Builder.SetInsertPoint(cast(User)); Value *Ex = Builder.CreateExtractElement(Vec, Lane); if (MinBWs.count(ScalarRoot)) Ex = Builder.CreateSExt(Ex, Scalar->getType()); CSEBlocks.insert(cast(User)->getParent()); User->replaceUsesOfWith(Scalar, Ex); } } else { Builder.SetInsertPoint(&F->getEntryBlock().front()); Value *Ex = Builder.CreateExtractElement(Vec, Lane); if (MinBWs.count(ScalarRoot)) Ex = Builder.CreateSExt(Ex, Scalar->getType()); CSEBlocks.insert(&F->getEntryBlock()); User->replaceUsesOfWith(Scalar, Ex); } DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); } // For each vectorized value: for (TreeEntry &EIdx : VectorizableTree) { TreeEntry *Entry = &EIdx; // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; // No need to handle users of gathered values. if (Entry->NeedToGather) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value"); Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { #ifndef NDEBUG for (User *U : Scalar->users()) { DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); assert((ScalarToTreeEntry.count(U) || // It is legal to replace users in the ignorelist by undef. (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) != UserIgnoreList.end())) && "Replacing out-of-tree value with undef"); } #endif Value *Undef = UndefValue::get(Ty); Scalar->replaceAllUsesWith(Undef); } DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); eraseInstruction(cast(Scalar)); } } Builder.ClearInsertionPoint(); return VectorizableTree[0].VectorizedValue; } void BoUpSLP::optimizeGatherSequence() { DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. for (Instruction *it : GatherSeq) { InsertElementInst *Insert = dyn_cast(it); if (!Insert) continue; // Check if this block is inside a loop. Loop *L = LI->getLoopFor(Insert->getParent()); if (!L) continue; // Check if it has a preheader. BasicBlock *PreHeader = L->getLoopPreheader(); if (!PreHeader) continue; // If the vector or the element that we insert into it are // instructions that are defined in this basic block then we can't // hoist this instruction. Instruction *CurrVec = dyn_cast(Insert->getOperand(0)); Instruction *NewElem = dyn_cast(Insert->getOperand(1)); if (CurrVec && L->contains(CurrVec)) continue; if (NewElem && L->contains(NewElem)) continue; // We can hoist this instruction. Move it to the pre-header. Insert->moveBefore(PreHeader->getTerminator()); } // Make a list of all reachable blocks in our CSE queue. SmallVector CSEWorkList; CSEWorkList.reserve(CSEBlocks.size()); for (BasicBlock *BB : CSEBlocks) if (DomTreeNode *N = DT->getNode(BB)) { assert(DT->isReachableFromEntry(N)); CSEWorkList.push_back(N); } // Sort blocks by domination. This ensures we visit a block after all blocks // dominating it are visited. std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), [this](const DomTreeNode *A, const DomTreeNode *B) { return DT->properlyDominates(A, B); }); // Perform O(N^2) search over the gather sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. SmallVector Visited; for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && "Worklist not sorted properly!"); BasicBlock *BB = (*I)->getBlock(); // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { Instruction *In = &*it++; if (!isa(In) && !isa(In)) continue; // Check if we can replace this instruction with any of the // visited instructions. for (Instruction *v : Visited) { if (In->isIdenticalTo(v) && DT->dominates(v->getParent(), In->getParent())) { In->replaceAllUsesWith(v); eraseInstruction(In); In = nullptr; break; } } if (In) { assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end()); Visited.push_back(In); } } } CSEBlocks.clear(); GatherSeq.clear(); } // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP) { if (isa(VL[0])) return true; // Initialize the instruction bundle. Instruction *OldScheduleEnd = ScheduleEnd; ScheduleData *PrevInBundle = nullptr; ScheduleData *Bundle = nullptr; bool ReSchedule = false; DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n"); // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { if (!extendSchedulingRegion(V)) return false; } for (Value *V : VL) { ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); if (BundleMember->IsScheduled) { // A bundle member was scheduled as single instruction before and now // needs to be scheduled as part of the bundle. We just get rid of the // existing schedule. DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember << " was already scheduled\n"); ReSchedule = true; } assert(BundleMember->isSchedulingEntity() && "bundle member already part of other bundle"); if (PrevInBundle) { PrevInBundle->NextInBundle = BundleMember; } else { Bundle = BundleMember; } BundleMember->UnscheduledDepsInBundle = 0; Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; // Group the instructions to a bundle. BundleMember->FirstInBundle = Bundle; PrevInBundle = BundleMember; } if (ScheduleEnd != OldScheduleEnd) { // The scheduling region got new instructions at the lower end (or it is a // new region for the first bundle). This makes it necessary to // recalculate all dependencies. // It is seldom that this needs to be done a second time after adding the // initial bundle to the region. for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { ScheduleData *SD = getScheduleData(I); SD->clearDependencies(); } ReSchedule = true; } if (ReSchedule) { resetSchedule(); initialFillReadyList(ReadyInsts); } DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " << BB->getName() << "\n"); calculateDependencies(Bundle, true, SLP); // Now try to schedule the new bundle. As soon as the bundle is "ready" it // means that there are no cyclic dependencies and we can schedule it. // Note that's important that we don't "schedule" the bundle yet (see // cancelScheduling). while (!Bundle->isReady() && !ReadyInsts.empty()) { ScheduleData *pickedSD = ReadyInsts.back(); ReadyInsts.pop_back(); if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) { schedule(pickedSD, ReadyInsts); } } if (!Bundle->isReady()) { cancelScheduling(VL); return false; } return true; } void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL) { if (isa(VL[0])) return; ScheduleData *Bundle = getScheduleData(VL[0]); DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && "tried to unbundle something which is not a bundle"); // Un-bundle: make single instructions out of the bundle. ScheduleData *BundleMember = Bundle; while (BundleMember) { assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; if (BundleMember->UnscheduledDepsInBundle == 0) { ReadyInsts.insert(BundleMember); } BundleMember = Next; } } bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { if (getScheduleData(V)) return true; Instruction *I = dyn_cast(V); assert(I && "bundle member must be an instruction"); assert(!isa(I) && "phi nodes don't need to be scheduled"); if (!ScheduleStart) { // It's the first instruction in the new region. initScheduleData(I, I->getNextNode(), nullptr, nullptr); ScheduleStart = I; ScheduleEnd = I->getNextNode(); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); return true; } // Search up and down at the same time, because we don't know if the new // instruction is above or below the existing scheduling region. BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator()); BasicBlock::reverse_iterator UpperEnd = BB->rend(); BasicBlock::iterator DownIter(ScheduleEnd); BasicBlock::iterator LowerEnd = BB->end(); for (;;) { if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); return false; } if (UpIter != UpperEnd) { if (&*UpIter == I) { initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); ScheduleStart = I; DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); return true; } UpIter++; } if (DownIter != LowerEnd) { if (&*DownIter == I) { initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, nullptr); ScheduleEnd = I->getNextNode(); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); return true; } DownIter++; } assert((UpIter != UpperEnd || DownIter != LowerEnd) && "instruction not found in block"); } return true; } void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, Instruction *ToI, ScheduleData *PrevLoadStore, ScheduleData *NextLoadStore) { ScheduleData *CurrentLoadStore = PrevLoadStore; for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { ScheduleData *SD = ScheduleDataMap[I]; if (!SD) { // Allocate a new ScheduleData for the instruction. if (ChunkPos >= ChunkSize) { ScheduleDataChunks.push_back( llvm::make_unique(ChunkSize)); ChunkPos = 0; } SD = &(ScheduleDataChunks.back()[ChunkPos++]); ScheduleDataMap[I] = SD; SD->Inst = I; } assert(!isInSchedulingRegion(SD) && "new ScheduleData already in scheduling region"); SD->init(SchedulingRegionID); if (I->mayReadOrWriteMemory()) { // Update the linked list of memory accessing instructions. if (CurrentLoadStore) { CurrentLoadStore->NextLoadStore = SD; } else { FirstLoadStoreInRegion = SD; } CurrentLoadStore = SD; } } if (NextLoadStore) { if (CurrentLoadStore) CurrentLoadStore->NextLoadStore = NextLoadStore; } else { LastLoadStoreInRegion = CurrentLoadStore; } } void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, bool InsertInReadyList, BoUpSLP *SLP) { assert(SD->isSchedulingEntity()); SmallVector WorkList; WorkList.push_back(SD); while (!WorkList.empty()) { ScheduleData *SD = WorkList.back(); WorkList.pop_back(); ScheduleData *BundleMember = SD; while (BundleMember) { assert(isInSchedulingRegion(BundleMember)); if (!BundleMember->hasValidDependencies()) { DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n"); BundleMember->Dependencies = 0; BundleMember->resetUnscheduledDeps(); // Handle def-use chain dependencies. for (User *U : BundleMember->Inst->users()) { if (isa(U)) { ScheduleData *UseSD = getScheduleData(U); if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { BundleMember->Dependencies++; ScheduleData *DestBundle = UseSD->FirstInBundle; if (!DestBundle->IsScheduled) { BundleMember->incrementUnscheduledDeps(1); } if (!DestBundle->hasValidDependencies()) { WorkList.push_back(DestBundle); } } } else { // I'm not sure if this can ever happen. But we need to be safe. // This lets the instruction/bundle never be scheduled and // eventually disable vectorization. BundleMember->Dependencies++; BundleMember->incrementUnscheduledDeps(1); } } // Handle the memory dependencies. ScheduleData *DepDest = BundleMember->NextLoadStore; if (DepDest) { Instruction *SrcInst = BundleMember->Inst; MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA); bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); unsigned numAliased = 0; unsigned DistToSrc = 1; while (DepDest) { assert(isInSchedulingRegion(DepDest)); // We have two limits to reduce the complexity: // 1) AliasedCheckLimit: It's a small limit to reduce calls to // SLP->isAliased (which is the expensive part in this loop). // 2) MaxMemDepDistance: It's for very large blocks and it aborts // the whole loop (even if the loop is fast, it's quadratic). // It's important for the loop break condition (see below) to // check this limit even between two read-only instructions. if (DistToSrc >= MaxMemDepDistance || ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && (numAliased >= AliasedCheckLimit || SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { // We increment the counter only if the locations are aliased // (instead of counting all alias checks). This gives a better // balance between reduced runtime and accurate dependencies. numAliased++; DepDest->MemoryDependencies.push_back(BundleMember); BundleMember->Dependencies++; ScheduleData *DestBundle = DepDest->FirstInBundle; if (!DestBundle->IsScheduled) { BundleMember->incrementUnscheduledDeps(1); } if (!DestBundle->hasValidDependencies()) { WorkList.push_back(DestBundle); } } DepDest = DepDest->NextLoadStore; // Example, explaining the loop break condition: Let's assume our // starting instruction is i0 and MaxMemDepDistance = 3. // // +--------v--v--v // i0,i1,i2,i3,i4,i5,i6,i7,i8 // +--------^--^--^ // // MaxMemDepDistance let us stop alias-checking at i3 and we add // dependencies from i0 to i3,i4,.. (even if they are not aliased). // Previously we already added dependencies from i3 to i6,i7,i8 // (because of MaxMemDepDistance). As we added a dependency from // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 // and we can abort this loop at i6. if (DistToSrc >= 2 * MaxMemDepDistance) break; DistToSrc++; } } } BundleMember = BundleMember->NextInBundle; } if (InsertInReadyList && SD->isReady()) { ReadyInsts.push_back(SD); DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n"); } } } void BoUpSLP::BlockScheduling::resetSchedule() { assert(ScheduleStart && "tried to reset schedule on block which has not been scheduled"); for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { ScheduleData *SD = getScheduleData(I); assert(isInSchedulingRegion(SD)); SD->IsScheduled = false; SD->resetUnscheduledDeps(); } ReadyInsts.clear(); } void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); BS->resetSchedule(); // For the real scheduling we use a more sophisticated ready-list: it is // sorted by the original instruction location. This lets the final schedule // be as close as possible to the original instruction order. struct ScheduleDataCompare { bool operator()(ScheduleData *SD1, ScheduleData *SD2) { return SD2->SchedulingPriority < SD1->SchedulingPriority; } }; std::set ReadyInsts; // Ensure that all dependency data is updated and fill the ready-list with // initial instructions. int Idx = 0; int NumToSchedule = 0; for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { ScheduleData *SD = BS->getScheduleData(I); assert( SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) && "scheduler and vectorizer have different opinion on what is a bundle"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { BS->calculateDependencies(SD, false, this); NumToSchedule++; } } BS->initialFillReadyList(ReadyInsts); Instruction *LastScheduledInst = BS->ScheduleEnd; // Do the "real" scheduling. while (!ReadyInsts.empty()) { ScheduleData *picked = *ReadyInsts.begin(); ReadyInsts.erase(ReadyInsts.begin()); // Move the scheduled instruction(s) to their dedicated places, if not // there yet. ScheduleData *BundleMember = picked; while (BundleMember) { Instruction *pickedInst = BundleMember->Inst; if (LastScheduledInst->getNextNode() != pickedInst) { BS->BB->getInstList().remove(pickedInst); BS->BB->getInstList().insert(LastScheduledInst->getIterator(), pickedInst); } LastScheduledInst = pickedInst; BundleMember = BundleMember->NextInBundle; } BS->schedule(picked, ReadyInsts); NumToSchedule--; } assert(NumToSchedule == 0 && "could not schedule all instructions"); // Avoid duplicate scheduling of the block. BS->ScheduleStart = nullptr; } unsigned BoUpSLP::getVectorElementSize(Value *V) { // If V is a store, just return the width of the stored value without // traversing the expression tree. This is the common case. if (auto *Store = dyn_cast(V)) return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); // If V is not a store, we can traverse the expression tree to find loads // that feed it. The type of the loaded value may indicate a more suitable // width than V's type. We want to base the vector element size on the width // of memory operations where possible. SmallVector Worklist; SmallPtrSet Visited; if (auto *I = dyn_cast(V)) Worklist.push_back(I); // Traverse the expression tree in bottom-up order looking for loads. If we // encounter an instruciton we don't yet handle, we give up. auto MaxWidth = 0u; auto FoundUnknownInst = false; while (!Worklist.empty() && !FoundUnknownInst) { auto *I = Worklist.pop_back_val(); Visited.insert(I); // We should only be looking at scalar instructions here. If the current // instruction has a vector type, give up. auto *Ty = I->getType(); if (isa(Ty)) FoundUnknownInst = true; // If the current instruction is a load, update MaxWidth to reflect the // width of the loaded value. else if (isa(I)) MaxWidth = std::max(MaxWidth, DL->getTypeSizeInBits(Ty)); // Otherwise, we need to visit the operands of the instruction. We only // handle the interesting cases from buildTree here. If an operand is an // instruction we haven't yet visited, we add it to the worklist. else if (isa(I) || isa(I) || isa(I) || isa(I) || isa(I) || isa(I)) { for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) if (!Visited.count(J)) Worklist.push_back(J); } // If we don't yet handle the instruction, give up. else FoundUnknownInst = true; } // If we didn't encounter a memory access in the expression tree, or if we // gave up for some reason, just return the width of V. if (!MaxWidth || FoundUnknownInst) return DL->getTypeSizeInBits(V->getType()); // Otherwise, return the maximum width we found. return MaxWidth; } // Determine if a value V in a vectorizable expression Expr can be demoted to a // smaller type with a truncation. We collect the values that will be demoted // in ToDemote and additional roots that require investigating in Roots. static bool collectValuesToDemote(Value *V, SmallPtrSetImpl &Expr, SmallVectorImpl &ToDemote, SmallVectorImpl &Roots) { // We can always demote constants. if (isa(V)) { ToDemote.push_back(V); return true; } // If the value is not an instruction in the expression with only one use, it // cannot be demoted. auto *I = dyn_cast(V); if (!I || !I->hasOneUse() || !Expr.count(I)) return false; switch (I->getOpcode()) { // We can always demote truncations and extensions. Since truncations can // seed additional demotion, we save the truncated value. case Instruction::Trunc: Roots.push_back(I->getOperand(0)); case Instruction::ZExt: case Instruction::SExt: break; // We can demote certain binary operations if we can demote both of their // operands. case Instruction::Add: case Instruction::Sub: case Instruction::Mul: case Instruction::And: case Instruction::Or: case Instruction::Xor: if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) || !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots)) return false; break; // We can demote selects if we can demote their true and false values. case Instruction::Select: { SelectInst *SI = cast(I); if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) || !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots)) return false; break; } // We can demote phis if we can demote all their incoming operands. Note that // we don't need to worry about cycles since we ensure single use above. case Instruction::PHI: { PHINode *PN = cast(I); for (Value *IncValue : PN->incoming_values()) if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots)) return false; break; } // Otherwise, conservatively give up. default: return false; } // Record the value that we can demote. ToDemote.push_back(V); return true; } void BoUpSLP::computeMinimumValueSizes() { // If there are no external uses, the expression tree must be rooted by a // store. We can't demote in-memory values, so there is nothing to do here. if (ExternalUses.empty()) return; // We only attempt to truncate integer expressions. auto &TreeRoot = VectorizableTree[0].Scalars; auto *TreeRootIT = dyn_cast(TreeRoot[0]->getType()); if (!TreeRootIT) return; // If the expression is not rooted by a store, these roots should have // external uses. We will rely on InstCombine to rewrite the expression in // the narrower type. However, InstCombine only rewrites single-use values. // This means that if a tree entry other than a root is used externally, it // must have multiple uses and InstCombine will not rewrite it. The code // below ensures that only the roots are used externally. SmallPtrSet Expr(TreeRoot.begin(), TreeRoot.end()); for (auto &EU : ExternalUses) if (!Expr.erase(EU.Scalar)) return; if (!Expr.empty()) return; // Collect the scalar values of the vectorizable expression. We will use this // context to determine which values can be demoted. If we see a truncation, // we mark it as seeding another demotion. for (auto &Entry : VectorizableTree) Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end()); // Ensure the roots of the vectorizable tree don't form a cycle. They must // have a single external user that is not in the vectorizable tree. for (auto *Root : TreeRoot) if (!Root->hasOneUse() || Expr.count(*Root->user_begin())) return; // Conservatively determine if we can actually truncate the roots of the // expression. Collect the values that can be demoted in ToDemote and // additional roots that require investigating in Roots. SmallVector ToDemote; SmallVector Roots; for (auto *Root : TreeRoot) if (!collectValuesToDemote(Root, Expr, ToDemote, Roots)) return; // The maximum bit width required to represent all the values that can be // demoted without loss of precision. It would be safe to truncate the roots // of the expression to this width. auto MaxBitWidth = 8u; // We first check if all the bits of the roots are demanded. If they're not, // we can truncate the roots to this narrower type. for (auto *Root : TreeRoot) { auto Mask = DB->getDemandedBits(cast(Root)); MaxBitWidth = std::max( Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth); } // If all the bits of the roots are demanded, we can try a little harder to // compute a narrower type. This can happen, for example, if the roots are // getelementptr indices. InstCombine promotes these indices to the pointer // width. Thus, all their bits are technically demanded even though the // address computation might be vectorized in a smaller type. // // We start by looking at each entry that can be demoted. We compute the // maximum bit width required to store the scalar by using ValueTracking to // compute the number of high-order bits we can truncate. if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) { MaxBitWidth = 8u; for (auto *Scalar : ToDemote) { auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT); auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType()); MaxBitWidth = std::max(NumTypeBits - NumSignBits, MaxBitWidth); } } // Round MaxBitWidth up to the next power-of-two. if (!isPowerOf2_64(MaxBitWidth)) MaxBitWidth = NextPowerOf2(MaxBitWidth); // If the maximum bit width we compute is less than the with of the roots' // type, we can proceed with the narrowing. Otherwise, do nothing. if (MaxBitWidth >= TreeRootIT->getBitWidth()) return; // If we can truncate the root, we must collect additional values that might // be demoted as a result. That is, those seeded by truncations we will // modify. while (!Roots.empty()) collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots); // Finally, map the values we can demote to the maximum bit with we computed. for (auto *Scalar : ToDemote) MinBWs[Scalar] = MaxBitWidth; } namespace { /// The SLPVectorizer Pass. struct SLPVectorizer : public FunctionPass { SLPVectorizerPass Impl; /// Pass identification, replacement for typeid static char ID; explicit SLPVectorizer() : FunctionPass(ID) { initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); } bool doInitialization(Module &M) override { return false; } bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; auto *SE = &getAnalysis().getSE(); auto *TTI = &getAnalysis().getTTI(F); auto *TLIP = getAnalysisIfAvailable(); auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; auto *AA = &getAnalysis().getAAResults(); auto *LI = &getAnalysis().getLoopInfo(); auto *DT = &getAnalysis().getDomTree(); auto *AC = &getAnalysis().getAssumptionCache(F); auto *DB = &getAnalysis().getDemandedBits(); return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB); } void getAnalysisUsage(AnalysisUsage &AU) const override { FunctionPass::getAnalysisUsage(AU); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); AU.setPreservesCFG(); } }; } // end anonymous namespace PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { auto *SE = &AM.getResult(F); auto *TTI = &AM.getResult(F); auto *TLI = AM.getCachedResult(F); auto *AA = &AM.getResult(F); auto *LI = &AM.getResult(F); auto *DT = &AM.getResult(F); auto *AC = &AM.getResult(F); auto *DB = &AM.getResult(F); bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB); if (!Changed) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserve(); PA.preserve(); PA.preserve(); PA.preserve(); return PA; } bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_) { SE = SE_; TTI = TTI_; TLI = TLI_; AA = AA_; LI = LI_; DT = DT_; AC = AC_; DB = DB_; DL = &F.getParent()->getDataLayout(); Stores.clear(); GEPs.clear(); bool Changed = false; // If the target claims to have no vector registers don't attempt // vectorization. if (!TTI->getNumberOfRegisters(true)) return false; // Don't vectorize when the attribute NoImplicitFloat is used. if (F.hasFnAttribute(Attribute::NoImplicitFloat)) return false; DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); // Use the bottom up slp vectorizer to construct chains that start with // store instructions. BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL); // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to // delete instructions. // Scan the blocks in the function in post order. for (auto BB : post_order(&F.getEntryBlock())) { collectSeedInstructions(BB); // Vectorize trees that end at stores. if (!Stores.empty()) { DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() << " underlying objects.\n"); Changed |= vectorizeStoreChains(R); } // Vectorize trees that end at reductions. Changed |= vectorizeChainsInBlock(BB, R); // Vectorize the index computations of getelementptr instructions. This // is primarily intended to catch gather-like idioms ending at // non-consecutive loads. if (!GEPs.empty()) { DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size() << " underlying objects.\n"); Changed |= vectorizeGEPIndices(BB, R); } } if (Changed) { R.optimizeGatherSequence(); DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); DEBUG(verifyFunction(F)); } return Changed; } /// \brief Check that the Values in the slice in VL array are still existent in /// the WeakVH array. /// Vectorization of part of the VL array may cause later values in the VL array /// to become invalid. We track when this has happened in the WeakVH array. static bool hasValueBeenRAUWed(ArrayRef VL, ArrayRef VH, unsigned SliceBegin, unsigned SliceSize) { VL = VL.slice(SliceBegin, SliceSize); VH = VH.slice(SliceBegin, SliceSize); return !std::equal(VL.begin(), VL.end(), VH.begin()); } bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, int CostThreshold, BoUpSLP &R, unsigned VecRegSize) { unsigned ChainLen = Chain.size(); DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen << "\n"); unsigned Sz = R.getVectorElementSize(Chain[0]); unsigned VF = VecRegSize / Sz; if (!isPowerOf2_32(Sz) || VF < 2) return false; // Keep track of values that were deleted by vectorizing in the loop below. SmallVector TrackValues(Chain.begin(), Chain.end()); bool Changed = false; // Look for profitable vectorizable trees at all offsets, starting at zero. for (unsigned i = 0, e = ChainLen; i < e; ++i) { if (i + VF > e) break; // Check that a previous iteration of this loop did not delete the Value. if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) continue; DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i << "\n"); ArrayRef Operands = Chain.slice(i, VF); R.buildTree(Operands); R.computeMinimumValueSizes(); int Cost = R.getTreeCost(); DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); if (Cost < CostThreshold) { DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); R.vectorizeTree(); // Move to the next bundle. i += VF - 1; Changed = true; } } return Changed; } bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, int costThreshold, BoUpSLP &R) { SetVector Heads, Tails; SmallDenseMap ConsecutiveChain; // We may run into multiple chains that merge into a single chain. We mark the // stores that we vectorized so that we don't visit the same store twice. BoUpSLP::ValueSet VectorizedStores; bool Changed = false; // Do a quadratic search on all of the given stores and find // all of the pairs of stores that follow each other. SmallVector IndexQueue; for (unsigned i = 0, e = Stores.size(); i < e; ++i) { IndexQueue.clear(); // If a store has multiple consecutive store candidates, search Stores // array according to the sequence: from i+1 to e, then from i-1 to 0. // This is because usually pairing with immediate succeeding or preceding // candidate create the best chance to find slp vectorization opportunity. unsigned j = 0; for (j = i + 1; j < e; ++j) IndexQueue.push_back(j); for (j = i; j > 0; --j) IndexQueue.push_back(j - 1); for (auto &k : IndexQueue) { if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) { Tails.insert(Stores[k]); Heads.insert(Stores[i]); ConsecutiveChain[Stores[i]] = Stores[k]; break; } } } // For stores that start but don't end a link in the chain: for (SetVector::iterator it = Heads.begin(), e = Heads.end(); it != e; ++it) { if (Tails.count(*it)) continue; // We found a store instr that starts a chain. Now follow the chain and try // to vectorize it. BoUpSLP::ValueList Operands; StoreInst *I = *it; // Collect the chain into a list. while (Tails.count(I) || Heads.count(I)) { if (VectorizedStores.count(I)) break; Operands.push_back(I); // Move to the next value in the chain. I = ConsecutiveChain[I]; } // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); Size /= 2) { if (vectorizeStoreChain(Operands, costThreshold, R, Size)) { // Mark the vectorized stores so that we don't vectorize them again. VectorizedStores.insert(Operands.begin(), Operands.end()); Changed = true; break; } } } return Changed; } void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { // Initialize the collections. We will make a single pass over the block. Stores.clear(); GEPs.clear(); // Visit the store and getelementptr instructions in BB and organize them in // Stores and GEPs according to the underlying objects of their pointer // operands. for (Instruction &I : *BB) { // Ignore store instructions that are volatile or have a pointer operand // that doesn't point to a scalar type. if (auto *SI = dyn_cast(&I)) { if (!SI->isSimple()) continue; if (!isValidElementType(SI->getValueOperand()->getType())) continue; Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI); } // Ignore getelementptr instructions that have more than one index, a // constant index, or a pointer operand that doesn't point to a scalar // type. else if (auto *GEP = dyn_cast(&I)) { auto Idx = GEP->idx_begin()->get(); if (GEP->getNumIndices() > 1 || isa(Idx)) continue; if (!isValidElementType(Idx->getType())) continue; if (GEP->getType()->isVectorTy()) continue; GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP); } } } bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { if (!A || !B) return false; Value *VL[] = { A, B }; return tryToVectorizeList(VL, R, None, true); } bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, ArrayRef BuildVector, bool allowReorder) { if (VL.size() < 2) return false; DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n"); // Check that all of the parts are scalar instructions of the same type. Instruction *I0 = dyn_cast(VL[0]); if (!I0) return false; unsigned Opcode0 = I0->getOpcode(); // FIXME: Register size should be a parameter to this function, so we can // try different vectorization factors. unsigned Sz = R.getVectorElementSize(I0); unsigned VF = R.getMinVecRegSize() / Sz; for (Value *V : VL) { Type *Ty = V->getType(); if (!isValidElementType(Ty)) return false; Instruction *Inst = dyn_cast(V); if (!Inst || Inst->getOpcode() != Opcode0) return false; } bool Changed = false; // Keep track of values that were deleted by vectorizing in the loop below. SmallVector TrackValues(VL.begin(), VL.end()); for (unsigned i = 0, e = VL.size(); i < e; ++i) { unsigned OpsWidth = 0; if (i + VF > e) OpsWidth = e - i; else OpsWidth = VF; if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) break; // Check that a previous iteration of this loop did not delete the Value. if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth)) continue; DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " << "\n"); ArrayRef Ops = VL.slice(i, OpsWidth); ArrayRef BuildVectorSlice; if (!BuildVector.empty()) BuildVectorSlice = BuildVector.slice(i, OpsWidth); R.buildTree(Ops, BuildVectorSlice); // TODO: check if we can allow reordering also for other cases than // tryToVectorizePair() if (allowReorder && R.shouldReorder()) { assert(Ops.size() == 2); assert(BuildVectorSlice.empty()); Value *ReorderedOps[] = { Ops[1], Ops[0] }; R.buildTree(ReorderedOps, None); } R.computeMinimumValueSizes(); int Cost = R.getTreeCost(); if (Cost < -SLPCostThreshold) { DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); Value *VectorizedRoot = R.vectorizeTree(); // Reconstruct the build vector by extracting the vectorized root. This // way we handle the case where some elements of the vector are undefined. // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2)) if (!BuildVectorSlice.empty()) { // The insert point is the last build vector instruction. The vectorized // root will precede it. This guarantees that we get an instruction. The // vectorized tree could have been constant folded. Instruction *InsertAfter = cast(BuildVectorSlice.back()); unsigned VecIdx = 0; for (auto &V : BuildVectorSlice) { IRBuilder Builder(InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter)); Instruction *I = cast(V); assert(isa(I) || isa(I)); Instruction *Extract = cast(Builder.CreateExtractElement( VectorizedRoot, Builder.getInt32(VecIdx++))); I->setOperand(1, Extract); I->removeFromParent(); I->insertAfter(Extract); InsertAfter = I; } } // Move to the next bundle. i += VF - 1; Changed = true; } } return Changed; } bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { if (!V) return false; // Try to vectorize V. if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R)) return true; BinaryOperator *A = dyn_cast(V->getOperand(0)); BinaryOperator *B = dyn_cast(V->getOperand(1)); // Try to skip B. if (B && B->hasOneUse()) { BinaryOperator *B0 = dyn_cast(B->getOperand(0)); BinaryOperator *B1 = dyn_cast(B->getOperand(1)); if (tryToVectorizePair(A, B0, R)) { return true; } if (tryToVectorizePair(A, B1, R)) { return true; } } // Try to skip A. if (A && A->hasOneUse()) { BinaryOperator *A0 = dyn_cast(A->getOperand(0)); BinaryOperator *A1 = dyn_cast(A->getOperand(1)); if (tryToVectorizePair(A0, B, R)) { return true; } if (tryToVectorizePair(A1, B, R)) { return true; } } return 0; } /// \brief Generate a shuffle mask to be used in a reduction tree. /// /// \param VecLen The length of the vector to be reduced. /// \param NumEltsToRdx The number of elements that should be reduced in the /// vector. /// \param IsPairwise Whether the reduction is a pairwise or splitting /// reduction. A pairwise reduction will generate a mask of /// <0,2,...> or <1,3,..> while a splitting reduction will generate /// <2,3, undef,undef> for a vector of 4 and NumElts = 2. /// \param IsLeft True will generate a mask of even elements, odd otherwise. static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, bool IsPairwise, bool IsLeft, IRBuilder<> &Builder) { assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask"); SmallVector ShuffleMask( VecLen, UndefValue::get(Builder.getInt32Ty())); if (IsPairwise) // Build a mask of 0, 2, ... (left) or 1, 3, ... (right). for (unsigned i = 0; i != NumEltsToRdx; ++i) ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft); else // Move the upper half of the vector to the lower half. for (unsigned i = 0; i != NumEltsToRdx; ++i) ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i); return ConstantVector::get(ShuffleMask); } /// Model horizontal reductions. /// /// A horizontal reduction is a tree of reduction operations (currently add and /// fadd) that has operations that can be put into a vector as its leaf. /// For example, this tree: /// /// mul mul mul mul /// \ / \ / /// + + /// \ / /// + /// This tree has "mul" as its reduced values and "+" as its reduction /// operations. A reduction might be feeding into a store or a binary operation /// feeding a phi. /// ... /// \ / /// + /// | /// phi += /// /// Or: /// ... /// \ / /// + /// | /// *p = /// class HorizontalReduction { SmallVector ReductionOps; SmallVector ReducedVals; BinaryOperator *ReductionRoot; PHINode *ReductionPHI; /// The opcode of the reduction. unsigned ReductionOpcode; /// The opcode of the values we perform a reduction on. unsigned ReducedValueOpcode; /// Should we model this reduction as a pairwise reduction tree or a tree that /// splits the vector in halves and adds those halves. bool IsPairwiseReduction; public: /// The width of one full horizontal reduction operation. unsigned ReduxWidth; /// Minimal width of available vector registers. It's used to determine /// ReduxWidth. unsigned MinVecRegSize; HorizontalReduction(unsigned MinVecRegSize) : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0), ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0), MinVecRegSize(MinVecRegSize) {} /// \brief Try to find a reduction tree. bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) { assert((!Phi || std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) && "Thi phi needs to use the binary operator"); // We could have a initial reductions that is not an add. // r *= v1 + v2 + v3 + v4 // In such a case start looking for a tree rooted in the first '+'. if (Phi) { if (B->getOperand(0) == Phi) { Phi = nullptr; B = dyn_cast(B->getOperand(1)); } else if (B->getOperand(1) == Phi) { Phi = nullptr; B = dyn_cast(B->getOperand(0)); } } if (!B) return false; Type *Ty = B->getType(); if (!isValidElementType(Ty)) return false; const DataLayout &DL = B->getModule()->getDataLayout(); ReductionOpcode = B->getOpcode(); ReducedValueOpcode = 0; // FIXME: Register size should be a parameter to this function, so we can // try different vectorization factors. ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty); ReductionRoot = B; ReductionPHI = Phi; if (ReduxWidth < 4) return false; // We currently only support adds. if (ReductionOpcode != Instruction::Add && ReductionOpcode != Instruction::FAdd) return false; // Post order traverse the reduction tree starting at B. We only handle true // trees containing only binary operators or selects. SmallVector, 32> Stack; Stack.push_back(std::make_pair(B, 0)); while (!Stack.empty()) { Instruction *TreeN = Stack.back().first; unsigned EdgeToVist = Stack.back().second++; bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode; // Only handle trees in the current basic block. if (TreeN->getParent() != B->getParent()) return false; // Each tree node needs to have one user except for the ultimate // reduction. if (!TreeN->hasOneUse() && TreeN != B) return false; // Postorder vist. if (EdgeToVist == 2 || IsReducedValue) { if (IsReducedValue) { // Make sure that the opcodes of the operations that we are going to // reduce match. if (!ReducedValueOpcode) ReducedValueOpcode = TreeN->getOpcode(); else if (ReducedValueOpcode != TreeN->getOpcode()) return false; ReducedVals.push_back(TreeN); } else { // We need to be able to reassociate the adds. if (!TreeN->isAssociative()) return false; ReductionOps.push_back(TreeN); } // Retract. Stack.pop_back(); continue; } // Visit left or right. Value *NextV = TreeN->getOperand(EdgeToVist); // We currently only allow BinaryOperator's and SelectInst's as reduction // values in our tree. if (isa(NextV) || isa(NextV)) Stack.push_back(std::make_pair(cast(NextV), 0)); else if (NextV != Phi) return false; } return true; } /// \brief Attempt to vectorize the tree found by /// matchAssociativeReduction. bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { if (ReducedVals.empty()) return false; unsigned NumReducedVals = ReducedVals.size(); if (NumReducedVals < ReduxWidth) return false; Value *VectorizedTree = nullptr; IRBuilder<> Builder(ReductionRoot); FastMathFlags Unsafe; Unsafe.setUnsafeAlgebra(); Builder.setFastMathFlags(Unsafe); unsigned i = 0; for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps); V.computeMinimumValueSizes(); // Estimate cost. int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]); if (Cost >= -SLPCostThreshold) break; DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost << ". (HorRdx)\n"); // Vectorize a tree. DebugLoc Loc = cast(ReducedVals[i])->getDebugLoc(); Value *VectorizedRoot = V.vectorizeTree(); // Emit a reduction. Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder); if (VectorizedTree) { Builder.SetCurrentDebugLocation(Loc); VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree, ReducedSubTree, "bin.rdx"); } else VectorizedTree = ReducedSubTree; } if (VectorizedTree) { // Finish the reduction. for (; i < NumReducedVals; ++i) { Builder.SetCurrentDebugLocation( cast(ReducedVals[i])->getDebugLoc()); VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree, ReducedVals[i]); } // Update users. if (ReductionPHI) { assert(ReductionRoot && "Need a reduction operation"); ReductionRoot->setOperand(0, VectorizedTree); ReductionRoot->setOperand(1, ReductionPHI); } else ReductionRoot->replaceAllUsesWith(VectorizedTree); } return VectorizedTree != nullptr; } unsigned numReductionValues() const { return ReducedVals.size(); } private: /// \brief Calculate the cost of a reduction. int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) { Type *ScalarTy = FirstReducedVal->getType(); Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true); int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false); IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost; int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost; int ScalarReduxCost = ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy); DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost << " for reduction that starts with " << *FirstReducedVal << " (It is a " << (IsPairwiseReduction ? "pairwise" : "splitting") << " reduction)\n"); return VecReduxCost - ScalarReduxCost; } static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L, Value *R, const Twine &Name = "") { if (Opcode == Instruction::FAdd) return Builder.CreateFAdd(L, R, Name); return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name); } /// \brief Emit a horizontal reduction of the vectorized value. Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) { assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { if (IsPairwiseReduction) { Value *LeftMask = createRdxShuffleMask(ReduxWidth, i, true, true, Builder); Value *RightMask = createRdxShuffleMask(ReduxWidth, i, true, false, Builder); Value *LeftShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l"); Value *RightShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), (RightMask), "rdx.shuf.r"); TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf, "bin.rdx"); } else { Value *UpperHalf = createRdxShuffleMask(ReduxWidth, i, false, false, Builder); Value *Shuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf"); TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx"); } } // The result is in the first element of the vector. return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); } }; /// \brief Recognize construction of vectors like /// %ra = insertelement <4 x float> undef, float %s0, i32 0 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 /// /// Returns true if it matches /// static bool findBuildVector(InsertElementInst *FirstInsertElem, SmallVectorImpl &BuildVector, SmallVectorImpl &BuildVectorOpds) { if (!isa(FirstInsertElem->getOperand(0))) return false; InsertElementInst *IE = FirstInsertElem; while (true) { BuildVector.push_back(IE); BuildVectorOpds.push_back(IE->getOperand(1)); if (IE->use_empty()) return false; InsertElementInst *NextUse = dyn_cast(IE->user_back()); if (!NextUse) return true; // If this isn't the final use, make sure the next insertelement is the only // use. It's OK if the final constructed vector is used multiple times if (!IE->hasOneUse()) return false; IE = NextUse; } return false; } /// \brief Like findBuildVector, but looks backwards for construction of aggregate. /// /// \return true if it matches. static bool findBuildAggregate(InsertValueInst *IV, SmallVectorImpl &BuildVector, SmallVectorImpl &BuildVectorOpds) { if (!IV->hasOneUse()) return false; Value *V = IV->getAggregateOperand(); if (!isa(V)) { InsertValueInst *I = dyn_cast(V); if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds)) return false; } BuildVector.push_back(IV); BuildVectorOpds.push_back(IV->getInsertedValueOperand()); return true; } static bool PhiTypeSorterFunc(Value *V, Value *V2) { return V->getType() < V2->getType(); } /// \brief Try and get a reduction value from a phi node. /// /// Given a phi node \p P in a block \p ParentBB, consider possible reductions /// if they come from either \p ParentBB or a containing loop latch. /// /// \returns A candidate reduction value if possible, or \code nullptr \endcode /// if not possible. static Value *getReductionValue(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI) { // There are situations where the reduction value is not dominated by the // reduction phi. Vectorizing such cases has been reported to cause // miscompiles. See PR25787. auto DominatedReduxValue = [&](Value *R) { return ( dyn_cast(R) && DT->dominates(P->getParent(), dyn_cast(R)->getParent())); }; Value *Rdx = nullptr; // Return the incoming value if it comes from the same BB as the phi node. if (P->getIncomingBlock(0) == ParentBB) { Rdx = P->getIncomingValue(0); } else if (P->getIncomingBlock(1) == ParentBB) { Rdx = P->getIncomingValue(1); } if (Rdx && DominatedReduxValue(Rdx)) return Rdx; // Otherwise, check whether we have a loop latch to look at. Loop *BBL = LI->getLoopFor(ParentBB); if (!BBL) return nullptr; BasicBlock *BBLatch = BBL->getLoopLatch(); if (!BBLatch) return nullptr; // There is a loop latch, return the incoming value if it comes from // that. This reduction pattern occassionaly turns up. if (P->getIncomingBlock(0) == BBLatch) { Rdx = P->getIncomingValue(0); } else if (P->getIncomingBlock(1) == BBLatch) { Rdx = P->getIncomingValue(1); } if (Rdx && DominatedReduxValue(Rdx)) return Rdx; return nullptr; } /// \brief Attempt to reduce a horizontal reduction. /// If it is legal to match a horizontal reduction feeding /// the phi node P with reduction operators BI, then check if it /// can be done. /// \returns true if a horizontal reduction was matched and reduced. /// \returns false if a horizontal reduction was not matched. static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI, BoUpSLP &R, TargetTransformInfo *TTI, unsigned MinRegSize) { if (!ShouldVectorizeHor) return false; HorizontalReduction HorRdx(MinRegSize); if (!HorRdx.matchAssociativeReduction(P, BI)) return false; // If there is a sufficient number of reduction values, reduce // to a nearby power-of-2. Can safely generate oversized // vectors and rely on the backend to split them to legal sizes. HorRdx.ReduxWidth = std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues())); return HorRdx.tryToReduce(R, TTI); } bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector Incoming; SmallSet VisitedInstrs; bool HaveVectorizedPhiNodes = true; while (HaveVectorizedPhiNodes) { HaveVectorizedPhiNodes = false; // Collect the incoming values from the PHIs. Incoming.clear(); for (Instruction &I : *BB) { PHINode *P = dyn_cast(&I); if (!P) break; if (!VisitedInstrs.count(P)) Incoming.push_back(P); } // Sort by type. std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc); // Try to vectorize elements base on their type. for (SmallVector::iterator IncIt = Incoming.begin(), E = Incoming.end(); IncIt != E;) { // Look for the next elements with the same type. SmallVector::iterator SameTypeIt = IncIt; while (SameTypeIt != E && (*SameTypeIt)->getType() == (*IncIt)->getType()) { VisitedInstrs.insert(*SameTypeIt); ++SameTypeIt; } // Try to vectorize them. unsigned NumElts = (SameTypeIt - IncIt); DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n"); if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) { // Success start over because instructions might have been changed. HaveVectorizedPhiNodes = true; Changed = true; break; } // Start over at the next instruction of a different type (or the end). IncIt = SameTypeIt; } } VisitedInstrs.clear(); for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { // We may go through BB multiple times so skip the one we have checked. if (!VisitedInstrs.insert(&*it).second) continue; if (isa(it)) continue; // Try to vectorize reductions that use PHINodes. if (PHINode *P = dyn_cast(it)) { // Check that the PHI is a reduction PHI. if (P->getNumIncomingValues() != 2) return Changed; Value *Rdx = getReductionValue(DT, P, BB, LI); // Check if this is a Binary Operator. BinaryOperator *BI = dyn_cast_or_null(Rdx); if (!BI) continue; // Try to match and vectorize a horizontal reduction. if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) { Changed = true; it = BB->begin(); e = BB->end(); continue; } Value *Inst = BI->getOperand(0); if (Inst == P) Inst = BI->getOperand(1); if (tryToVectorize(dyn_cast(Inst), R)) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. Changed = true; it = BB->begin(); e = BB->end(); continue; } continue; } if (ShouldStartVectorizeHorAtStore) if (StoreInst *SI = dyn_cast(it)) if (BinaryOperator *BinOp = dyn_cast(SI->getValueOperand())) { if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI, R.getMinVecRegSize()) || tryToVectorize(BinOp, R)) { Changed = true; it = BB->begin(); e = BB->end(); continue; } } // Try to vectorize horizontal reductions feeding into a return. if (ReturnInst *RI = dyn_cast(it)) if (RI->getNumOperands() != 0) if (BinaryOperator *BinOp = dyn_cast(RI->getOperand(0))) { DEBUG(dbgs() << "SLP: Found a return to vectorize.\n"); if (tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1), R)) { Changed = true; it = BB->begin(); e = BB->end(); continue; } } // Try to vectorize trees that start at compare instructions. if (CmpInst *CI = dyn_cast(it)) { if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) { Changed = true; // We would like to start over since some instructions are deleted // and the iterator may become invalid value. it = BB->begin(); e = BB->end(); continue; } for (int i = 0; i < 2; ++i) { if (BinaryOperator *BI = dyn_cast(CI->getOperand(i))) { if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) { Changed = true; // We would like to start over since some instructions are deleted // and the iterator may become invalid value. it = BB->begin(); e = BB->end(); break; } } } continue; } // Try to vectorize trees that start at insertelement instructions. if (InsertElementInst *FirstInsertElem = dyn_cast(it)) { SmallVector BuildVector; SmallVector BuildVectorOpds; if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds)) continue; // Vectorize starting with the build vector operands ignoring the // BuildVector instructions for the purpose of scheduling and user // extraction. if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) { Changed = true; it = BB->begin(); e = BB->end(); } continue; } // Try to vectorize trees that start at insertvalue instructions feeding into // a store. if (StoreInst *SI = dyn_cast(it)) { if (InsertValueInst *LastInsertValue = dyn_cast(SI->getValueOperand())) { const DataLayout &DL = BB->getModule()->getDataLayout(); if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) { SmallVector BuildVector; SmallVector BuildVectorOpds; if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds)) continue; DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n"); if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) { Changed = true; it = BB->begin(); e = BB->end(); } continue; } } } } return Changed; } bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { auto Changed = false; for (auto &Entry : GEPs) { // If the getelementptr list has fewer than two elements, there's nothing // to do. if (Entry.second.size() < 2) continue; DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length " << Entry.second.size() << ".\n"); // We process the getelementptr list in chunks of 16 (like we do for // stores) to minimize compile-time. for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) { auto Len = std::min(BE - BI, 16); auto GEPList = makeArrayRef(&Entry.second[BI], Len); // Initialize a set a candidate getelementptrs. Note that we use a // SetVector here to preserve program order. If the index computations // are vectorizable and begin with loads, we want to minimize the chance // of having to reorder them later. SetVector Candidates(GEPList.begin(), GEPList.end()); // Some of the candidates may have already been vectorized after we // initially collected them. If so, the WeakVHs will have nullified the // values, so remove them from the set of candidates. Candidates.remove(nullptr); // Remove from the set of candidates all pairs of getelementptrs with // constant differences. Such getelementptrs are likely not good // candidates for vectorization in a bottom-up phase since one can be // computed from the other. We also ensure all candidate getelementptr // indices are unique. for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) { auto *GEPI = cast(GEPList[I]); if (!Candidates.count(GEPI)) continue; auto *SCEVI = SE->getSCEV(GEPList[I]); for (int J = I + 1; J < E && Candidates.size() > 1; ++J) { auto *GEPJ = cast(GEPList[J]); auto *SCEVJ = SE->getSCEV(GEPList[J]); if (isa(SE->getMinusSCEV(SCEVI, SCEVJ))) { Candidates.remove(GEPList[I]); Candidates.remove(GEPList[J]); } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) { Candidates.remove(GEPList[J]); } } } // We break out of the above computation as soon as we know there are // fewer than two candidates remaining. if (Candidates.size() < 2) continue; // Add the single, non-constant index of each candidate to the bundle. We // ensured the indices met these constraints when we originally collected // the getelementptrs. SmallVector Bundle(Candidates.size()); auto BundleIndex = 0u; for (auto *V : Candidates) { auto *GEP = cast(V); auto *GEPIdx = GEP->idx_begin()->get(); assert(GEP->getNumIndices() == 1 || !isa(GEPIdx)); Bundle[BundleIndex++] = GEPIdx; } // Try and vectorize the indices. We are currently only interested in // gather-like cases of the form: // // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ... // // where the loads of "a", the loads of "b", and the subtractions can be // performed in parallel. It's likely that detecting this pattern in a // bottom-up phase will be simpler and less costly than building a // full-blown top-down phase beginning at the consecutive loads. Changed |= tryToVectorizeList(Bundle, R); } } return Changed; } bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { bool Changed = false; // Attempt to sort and vectorize each of the store-groups. for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e; ++it) { if (it->second.size() < 2) continue; DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << it->second.size() << ".\n"); // Process the stores in chunks of 16. // TODO: The limit of 16 inhibits greater vectorization factors. // For example, AVX2 supports v32i8. Increasing this limit, however, // may cause a significant compile-time increase. for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) { unsigned Len = std::min(CE - CI, 16); Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), -SLPCostThreshold, R); } } return Changed; } char SLPVectorizer::ID = 0; static const char lv_name[] = "SLP Vectorizer"; INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) namespace llvm { Pass *createSLPVectorizerPass() { return new SLPVectorizer(); } } Index: projects/clang390-import/contrib/llvm =================================================================== --- projects/clang390-import/contrib/llvm (revision 304769) +++ projects/clang390-import/contrib/llvm (revision 304770) Property changes on: projects/clang390-import/contrib/llvm ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /vendor/llvm/dist:r304310-304769